Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[develop] Enable workflow runs on single node linux/mac machine using rocoto. #508

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion etc/lmod-setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ if [ "$L_MACHINE" = macos ]; then
module purge

elif [ "$L_MACHINE" = linux ]; then
export BASH_ENV="/usr/share/share/lmod/init/bash"
export BASH_ENV="/usr/share/lmod/lmod/init/bash"
source $BASH_ENV

module purge
Expand Down
1 change: 0 additions & 1 deletion modulefiles/build_linux_gnu.lua
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ load("hpc")
load("hpc-python")

load("hpc-gnu")
load("openmpi")
load("hpc-openmpi")

load("srw_common")
Expand Down
1 change: 0 additions & 1 deletion modulefiles/build_macos_gnu.lua
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ load("hpc")
load("hpc-python")

load("hpc-gnu")
load("openmpi")
load("hpc-openmpi")

load("srw_common")
Expand Down
33 changes: 26 additions & 7 deletions modulefiles/wflow_linux.lua
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,35 @@ This module sets a path to activate conda environment needed for running the UFS
whatis([===[This module sets a path for conda environment needed for running the UFS SRW App on Linux]===])

setenv("CMAKE_Platform", "linux")
setenv("VENV", pathJoin(os.getenv("HOME"), "condaenv/envs/regional_workflow"))

--[[
local ROCOTOmod="/Users/username/modules"
prepend_path("MODULEPATH", ROCOTOmod)
load(rocoto)
--]]
-- Source conda's shell profile script when this module is loaded.
-- conda_path: root directory of the conda installation; the script is
-- looked up under etc/profile.d inside it.
function init_conda(conda_path)
  -- csh-type shells get conda.csh; every other shell type gets conda.sh.
  local script_name = "etc/profile.d/conda.sh"
  if myShellType() == "csh" then
    script_name = "etc/profile.d/conda.csh"
  end
  local profile_script = pathJoin(conda_path, script_name)
  -- modeA restricts the command to the "load" mode.
  execute{cmd="source " .. profile_script, modeA={"load"}}
end

-- initialize conda
local conda_path="/home/username/miniconda3"
init_conda(conda_path)

-- add rocoto to path
local rocoto_path="/home/username/rocoto"
prepend_path("PATH", pathJoin(rocoto_path,"bin"))

-- add fake slurm commands
local srw_path="/home/username/ufs-srweather-app"
prepend_path("PATH", pathJoin(srw_path, "ush/rocoto_fake_slurm"))

-- display conda activation message
if mode() == "load" then
LmodMsgRaw([===[Please do the following to activate conda:
> conda activate $VENV
> conda activate regional_workflow
]===])
end
33 changes: 26 additions & 7 deletions modulefiles/wflow_macos.lua
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,36 @@ This module set a path needed to activate conda environement for running UFS SRW
whatis([===[This module activates conda environment for running the UFS SRW App on macOS]===])

setenv("CMAKE_Platform", "macos")
setenv("VENV", pathJoin(os.getenv("HOME"), "condaenv/envs/regional_workflow"))

--[[
local ROCOTOmod="/Users/username/modules"
prepend_path("MODULEPATH", ROCOTOmod)
load(rocoto)
--]]
-- Source conda's shell profile script when this module is loaded.
-- conda_path: root directory of the conda installation; the script is
-- looked up under etc/profile.d inside it.
function init_conda(conda_path)
  -- csh-type shells get conda.csh; every other shell type gets conda.sh.
  local script_name = "etc/profile.d/conda.sh"
  if myShellType() == "csh" then
    script_name = "etc/profile.d/conda.csh"
  end
  local profile_script = pathJoin(conda_path, script_name)
  -- modeA restricts the command to the "load" mode.
  execute{cmd="source " .. profile_script, modeA={"load"}}
end

-- initialize conda
local conda_path="/Users/username/miniconda3"
init_conda(conda_path)

-- add rocoto to path
local rocoto_path="/Users/username/rocoto"
prepend_path("PATH", pathJoin(rocoto_path,"bin"))

-- add fake slurm commands
local srw_path="/Users/username/ufs-srweather-app"
prepend_path("PATH", pathJoin(srw_path, "ush/rocoto_fake_slurm"))

-- Display the conda activation hint, only when the module is being loaded.
-- The text is printed verbatim, so the command lines must not carry stray
-- quote characters that a user could copy into the shell.
if mode() == "load" then
LmodMsgRaw([===[Please do the following to activate conda virtual environment:
> conda activate $VENV
> conda activate regional_workflow
]===])
end

2 changes: 1 addition & 1 deletion parm/FV3LAM_wflow.xml
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ tasks; and the "FCST" type is used for the RUN_FCST_TN task.

]>

<workflow realtime="F" scheduler="&SCHED;" cyclethrottle="20">
<workflow realtime="F" scheduler="&SCHED;" cyclethrottle="20" taskthrottle="{{ taskthrottle }}">
{# Double quotes are required inside the strftime! Expect an error from reading the template if using single quotes. #}
<cycledef group="at_start">{{ cdate_first_cycl.strftime("%M %H %d %m %Y *") }}</cycledef>
<cycledef group="forecast">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ platform:
workflow:
CCPP_PHYS_SUITE: FV3_GFS_2017_gfdlmp
PREDEF_GRID_NAME: RRFS_CONUS_25km
DATE_FIRST_CYCL: date --utc --date="2 days ago" +%Y%m%d00
DATE_LAST_CYCL: date --utc --date="2 days ago" +%Y%m%d00
DATE_FIRST_CYCL: $DATE_UTIL --utc --date="2 days ago" +%Y%m%d00
DATE_LAST_CYCL: $DATE_UTIL --utc --date="2 days ago" +%Y%m%d00
FCST_LEN_HRS: 6
PREEXISTING_DIR_METHOD: rename
task_get_extrn_ics:
Expand Down
5 changes: 5 additions & 0 deletions ush/config_defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,10 @@ platform:
# The number of cores available per node on the compute platform, now
# configurable for all platforms.
#
# TASKTHROTTLE:
# The maximum number of active tasks that run simultaneously. On Linux/macOS,
# setting this to 1 makes sense.
#
# BUILD_MOD_FN:
# Name of alternative build module file to use if using an
# unsupported platform. Is set automatically for supported machines.
Expand Down Expand Up @@ -158,6 +162,7 @@ platform:
#
WORKFLOW_MANAGER: ""
NCORES_PER_NODE: ""
TASKTHROTTLE: 1000
BUILD_MOD_FN: 'build_{{ user.MACHINE|lower() }}_{{ workflow.COMPILER }}'
WFLOW_MOD_FN: 'wflow_{{ user.MACHINE|lower() }}'
BUILD_VER_FN: 'build.ver.{{ user.MACHINE|lower() }}'
Expand Down
2 changes: 1 addition & 1 deletion ush/constants.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ constants:
#-----------------------------------------------------------------------
#
PI_GEOM: 3.14159265358979323846264338327
DEGS_PER_RADIAN: 57.2957795131
DEGS_PER_RADIAN: 57.29577951308232087679
RADIUS_EARTH: 6371200.0
#
#-----------------------------------------------------------------------
Expand Down
25 changes: 19 additions & 6 deletions ush/machine/linux.yaml
Original file line number Diff line number Diff line change
@@ -1,18 +1,31 @@
platform:
WORKFLOW_MANAGER: none
WORKFLOW_MANAGER: rocoto
NCORES_PER_NODE: 8
SCHED: none
RUN_CMD_FCST: 'mpirun -n ${PE_MEMBER01} '
RUN_CMD_POST: 'mpirun -n 4 '
TASKTHROTTLE: 1
SCHED: slurm
CCPA_OBS_DIR: /home/username/DATA/UFS/obs_data/ccpa/proc
MRMS_OBS_DIR: /home/username/DATA/UFS/obs_data/mrms/proc
NDAS_OBS_DIR: /home/username/DATA/UFS/obs_data/ndas/proc
METPLUS_PATH: ""
MET_BIN_EXEC: bin
MET_INSTALL_DIR: ""
DOMAIN_PREGEN_BASEDIR: /home/username/DATA/UFS/FV3LAM_pregen
RUN_CMD_FCST: mpirun -n ${PE_MEMBER01}
RUN_CMD_POST: mpirun
RUN_CMD_SERIAL: time
RUN_CMD_UTILS: mpirun -n 4
PRE_TASK_CMDS: '{ ulimit -a; }'
RUN_CMD_UTILS: mpirun
PRE_TASK_CMDS: '{ ulimit -a; ulimit -s unlimited; }'
TEST_EXTRN_MDL_SOURCE_BASEDIR: /home/username/DATA/UFS/input_model_data
TEST_PREGEN_BASEDIR: /home/username/DATA/UFS/FV3LAM_pregen
TEST_ALT_EXTRN_MDL_SYSBASEDIR_ICS: /home/username/DATA/UFS/dummy_FV3GFS_sys_dir
TEST_ALT_EXTRN_MDL_SYSBASEDIR_LBCS: /home/username/DATA/UFS/dummy_FV3GFS_sys_dir
FIXaer: /home/username/DATA/UFS/fix/fix_aer
FIXgsm: /home/username/DATA/UFS/fix/fix_am
FIXlut: /home/username/DATA/UFS/fix/fix_lut
FIXorg: /home/username/DATA/UFS/fix/fix_orog
FIXsfc: /home/username/DATA/UFS/fix/fix_sfc_climo
FIXshp: /home/username/DATA/UFS/NaturalEarth
EXTRN_MDL_DATA_STORES: aws nomads
data:
ics_lbcs:
FV3GFS: /home/username/DATA/UFS/FV3GFS
25 changes: 19 additions & 6 deletions ush/machine/macos.yaml
Original file line number Diff line number Diff line change
@@ -1,18 +1,31 @@
platform:
WORKFLOW_MANAGER: none
WORKFLOW_MANAGER: rocoto
NCORES_PER_NODE: 8
SCHED: none
RUN_CMD_FCST: 'mpirun -n ${PE_MEMBER01} '
RUN_CMD_POST: 'mpirun -n 4 '
TASKTHROTTLE: 1
SCHED: slurm
CCPA_OBS_DIR: /Users/username/DATA/UFS/obs_data/ccpa/proc
MRMS_OBS_DIR: /Users/username/DATA/UFS/obs_data/mrms/proc
NDAS_OBS_DIR: /Users/username/DATA/UFS/obs_data/ndas/proc
DOMAIN_PREGEN_BASEDIR: /Users/username/DATA/UFS/FV3LAM_pregen
METPLUS_PATH: ""
MET_BIN_EXEC: bin
MET_INSTALL_DIR: ""
RUN_CMD_FCST: mpirun -n ${PE_MEMBER01}
RUN_CMD_POST: mpirun
RUN_CMD_SERIAL: time
RUN_CMD_UTILS: mpirun -n 4
PRE_TASK_CMDS: '{ ulimit -a; }'
RUN_CMD_UTILS: mpirun
PRE_TASK_CMDS: '{ ulimit -a; ulimit -s unlimited; }'
TEST_EXTRN_MDL_SOURCE_BASEDIR: /Users/username/DATA/UFS/input_model_data
TEST_PREGEN_BASEDIR: /Users/username/DATA/UFS/FV3LAM_pregen
TEST_ALT_EXTRN_MDL_SYSBASEDIR_ICS: /Users/username/DATA/UFS/dummy_FV3GFS_sys_dir
TEST_ALT_EXTRN_MDL_SYSBASEDIR_LBCS: /Users/username/DATA/UFS/dummy_FV3GFS_sys_dir
FIXaer: /Users/username/DATA/UFS/fix/fix_aer
FIXgsm: /Users/username/DATA/UFS/fix/fix_am
FIXlut: /Users/username/DATA/UFS/fix/fix_lut
FIXorg: /Users/username/DATA/UFS/fix/fix_orog
FIXsfc: /Users/username/DATA/UFS/fix/fix_sfc_climo
FIXshp: /Users/username/DATA/UFS/NaturalEarth
EXTRN_MDL_DATA_STORES: aws nomads
data:
ics_lbcs:
FV3GFS: /Users/username/DATA/UFS/FV3GFS
4 changes: 1 addition & 3 deletions ush/python_utils/config_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
except ModuleNotFoundError:
pass
# The rest of the formats: JSON/SHELL/INI/XML do not need
# external pakcages
# external packages
import json
import os
import re
Expand Down Expand Up @@ -398,8 +398,6 @@ def cfg_to_xml_str(cfg):
##################
# CONFIG utils
##################


def flatten_dict(dictionary, keys=None):
"""Flatten a recursive dictionary (e.g.yaml/json) to be one level deep

Expand Down
42 changes: 42 additions & 0 deletions ush/rocoto_fake_slurm/sacct
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/bin/bash

# Emulates slurm's sacct.
#
# Job records are read from the .job_database file in the current directory.
# Each record is one line of the form "<name> pid <pid> <event> ...", where
# <event> is "submitted", "started" or "ended".
#
# Usage: sacct [--jobs=<pid>[,<pid>...]]
#   When the first argument starts with --jobs=, only the listed pids are
#   reported; otherwise the pids come from the "submitted" records.

DB=.job_database

# Start from an empty list so a PIDS value inherited from the environment is
# never reported by accident.
PIDS=""
if [[ "$1" = "--jobs="* ]]; then
  PIDS="${1:7}"          # drop the 7-character "--jobs=" prefix
  PIDS="${PIDS//,/ }"    # comma-separated list -> space-separated list
elif [[ -f "$DB" ]]; then
  # One pid per distinct job name (sort -u on the first field).
  PIDS=$(grep submitted "$DB" | sort -u -k1,1 | awk '{print $3}')
fi

# job_record <pid> <event> <awk-fields>
# Prints the selected fields of every "<event>" record of job <pid>.
# Prints nothing, instead of failing, while the database does not exist.
job_record() {
  [[ -f "$DB" ]] || return 0
  grep "pid $1 $2" "$DB" | awk "{print $3}"
}

# Output info the way rocoto calls sacct
FMT="%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s\n"
echo "JobID|User|JobName|Partition|Priority|Submit|Start|End|NCPUS|ExitCode|State"

for pid in ${PIDS}; do

  # Defaults describe a job that has no record in the database.
  t_sub="N/A"
  t_start="N/A"
  t_end="N/A"
  name=$pid
  user=${USER:-user}
  exitc=0
  state="UNKNOWN"

  # Later events override earlier ones: submitted -> started -> ended.
  v=$(job_record "$pid" submitted '$1" "$5')
  if [[ -n "$v" ]]; then
    state="PENDING"
    read -r name t_sub <<< "$v"

    v=$(job_record "$pid" started '$5" "$7')
    if [[ -n "$v" ]]; then
      state="RUNNING"
      # While running, the end time is the one written on the "started" record.
      read -r t_start t_end <<< "$v"
    fi

    v=$(job_record "$pid" ended '$5" "$7')
    if [[ -n "$v" ]]; then
      state="COMPLETED"
      read -r t_end exitc <<< "$v"
    fi
  fi

  # Quote every field so an empty value cannot shift the columns.
  printf "$FMT" "$pid" "${user:0:30}" "${name:0:30}" linux 0.1 "$t_sub" "$t_start" "$t_end" 1 "$exitc" "$state"
done
48 changes: 48 additions & 0 deletions ush/rocoto_fake_slurm/sbatch
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/bin/bash

# Emulates slurm's sbatch

# Job script to submit: the file named by the first argument, or standard
# input when no argument is given.
FD=${1:-/dev/stdin}

# Read the job script exactly once. Standard input cannot be relied on to be
# readable more than once, so every directive below is parsed from this
# in-memory copy instead of reopening $FD.
SCRIPT=$(cat -- "$FD")

#parse log file
LOG=$(grep "#SBATCH -o" <<< "$SCRIPT" | awk '{ print $3 }')
if [ -z "$LOG" ]; then
  LOG=/dev/null
fi

#parse time
TIM=$(grep "#SBATCH -t" <<< "$SCRIPT" | awk '{ print $3 }')
if [ -z "$TIM" ]; then
  SECS=
  CTIM=
else
  # The limit is split on ":" and read as hours:minutes:seconds.
  SECS=$(echo $TIM | awk 'BEGIN { FS = ":" } ; { secs = $1 * 3600 + $2 * 60 + $3; print secs };')
  CTIM="timeout ${SECS}s"
fi

#parse job name
JOBNAME=$(grep "#SBATCH --job-name" <<< "$SCRIPT" | awk 'BEGIN { FS = "=" }; { print $2 }')
if [ -z "$JOBNAME" ]; then
  JOBNAME="default"
fi

#command
CMD="$SCRIPT"

#execute job in background
bash -c "\
ds=\$(date --utc +%Y-%m-%d:%H:%M:%S); \
de=\$(date --utc -d '$SECS sec' +%Y-%m-%d:%H:%M:%S); \
echo $JOBNAME pid \$$ started \$ds ends \$de >>.job_database; \
\
${CTIM} ${CMD} &>$LOG; \
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't you need to have an echo $? in here so that the status of the timeout is actually written to $LOG? I'm not seeing how the exit status is being written to $LOG for later retrieval.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The exit code is retrieved from the log file (SRW specific solution) in the line below. I did try $? at first but it was reporting 0 (success) for failed jobs -- didn't investigate further. I will try again since that is a generic solution.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have now made it use $? directly. I think that in my previous test I forgot to escape it as \$?; without the escape it always reports an exit code of 0.

excode=\$?; \
\
de=\$(date --utc +%Y-%m-%d:%H:%M:%S); \
echo $JOBNAME pid \$$ ended \$de exitcode \$excode >>.job_database;" &

#submission info
# $! is the pid of the background job started just above; it serves as the
# job id that is recorded and reported back to the caller.
pid=$!
# -u is the short form of --utc and is accepted by both GNU and BSD date.
dsub=$(date -u +%Y-%m-%d:%H:%M:%S)
echo "$JOBNAME pid $pid submitted $dsub" >>.job_database
echo "Submitted batch job $pid"
4 changes: 4 additions & 0 deletions ush/rocoto_fake_slurm/scancel
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash

# Emulates slurm's scancel: force-kills (SIGKILL) the job whose id is given
# as the first argument.
#
# Usage: scancel <pid>

# Refuse anything that is not a pid greater than 1: an empty argument would
# make kill fail on a bare "-", and pid 1 would turn into "kill -9 -1",
# which signals every process the user owns.
if [[ ! "$1" =~ ^[1-9][0-9]*$ ]] || [[ "$1" -le 1 ]]; then
  echo "usage: scancel <pid>" >&2
  exit 1
fi

# Kill the whole process group first so children of the job die with it.
# NOTE(review): a job started from a non-interactive shell is presumably not
# a process-group leader, in which case the group kill fails -- confirm.
# Fall back to killing the job process itself.
kill -9 -- "-$1" 2>/dev/null || exec kill -9 "$1"
4 changes: 4 additions & 0 deletions ush/rocoto_fake_slurm/sinfo
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash

# Emulates slurm's sinfo by replacing this process with lscpu, so the output
# is whatever lscpu prints and any arguments passed to sinfo are ignored.
# NOTE(review): lscpu is presumably unavailable on macOS -- confirm whether
# anything invokes sinfo there.
exec lscpu
42 changes: 42 additions & 0 deletions ush/rocoto_fake_slurm/squeue
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/bin/bash

# Emulates slurm's squeue.
#
# Job records are read from the .job_database file in the current directory.
# Each record is one line of the form "<name> pid <pid> <event> ...", where
# <event> is "submitted", "started" or "ended".
#
# Usage: squeue [--jobs=<pid>[,<pid>...]]
#   When the first argument starts with --jobs=, only the listed pids are
#   reported; otherwise the pids come from the "submitted" records.

DB=.job_database

# Start from an empty list so a PIDS value inherited from the environment is
# never reported by accident.
PIDS=""
if [[ "$1" = "--jobs="* ]]; then
  PIDS="${1:7}"          # drop the 7-character "--jobs=" prefix
  PIDS="${PIDS//,/ }"    # comma-separated list -> space-separated list
elif [[ -f "$DB" ]]; then
  # One pid per distinct job name (sort -u on the first field).
  PIDS=$(grep submitted "$DB" | sort -u -k1,1 | awk '{print $3}')
fi

# job_record <pid> <event> <awk-fields>
# Prints the selected fields of every "<event>" record of job <pid>.
# Prints nothing, instead of failing, while the database does not exist.
job_record() {
  [[ -f "$DB" ]] || return 0
  grep "pid $1 $2" "$DB" | awk "{print $3}"
}

# Output info the way rocoto calls squeue
FMT="%-40s%-40s%-10s%-20s%-30s%-30s%-30s%-30s%-10s%-30s%-200s\n"
printf "$FMT" JOBID USER CPUS PARTITION SUBMIT_TIME START_TIME END_TIME PRIORITY EXIT_CODE STATE NAME

for pid in ${PIDS}; do

  # Defaults describe a job that has no record in the database.
  t_sub="N/A"
  t_start="N/A"
  t_end="N/A"
  name=$pid
  user=${USER:-user}
  exitc=0
  state="UNKNOWN"

  # Later events override earlier ones: submitted -> started -> ended.
  v=$(job_record "$pid" submitted '$1" "$5')
  if [[ -n "$v" ]]; then
    state="PENDING"
    read -r name t_sub <<< "$v"

    v=$(job_record "$pid" started '$5" "$7')
    if [[ -n "$v" ]]; then
      state="RUNNING"
      # While running, the end time is the one written on the "started" record.
      read -r t_start t_end <<< "$v"
    fi

    v=$(job_record "$pid" ended '$5" "$7')
    if [[ -n "$v" ]]; then
      state="COMPLETED"
      read -r t_end exitc <<< "$v"
    fi
  fi

  # Quote every field so an empty value cannot shift the fixed-width columns.
  printf "$FMT" "$pid" "$user" 1 linux "$t_sub" "$t_start" "$t_end" 0.1 "$exitc" "$state" "$name"
done
Loading