Skip to content

Commit

Permalink
Merge pull request #522 from GEOS-ESM/feature/mathomp4/updates-for-milan
Browse files Browse the repository at this point in the history
Updates for SCU17
  • Loading branch information
sdrabenh authored Oct 18, 2023
2 parents 566e879 + d8a29ef commit 826aa0c
Show file tree
Hide file tree
Showing 4 changed files with 282 additions and 118 deletions.
99 changes: 70 additions & 29 deletions gcm_setup
Original file line number Diff line number Diff line change
Expand Up @@ -169,8 +169,8 @@ endif

setenv BASEDIR `awk '{print $2}' $ETCDIR/BASEDIR.rc`

if ( `echo $BASEDIR | grep -i mvapich2` != '') then
set MPI = mvapich2
if ( `echo $BASEDIR | grep -i mvapich` != '') then
set MPI = mvapich
else if ( `echo $BASEDIR | grep -i mpich` != '') then
set MPI = mpich
else if ( `echo $BASEDIR | grep -i openmpi` != '') then
Expand Down Expand Up @@ -386,10 +386,6 @@ if( $HRCODE == 'c180' | \
$HRCODE == 'c1536' | \
$HRCODE == 'c2160' ) then

set DEFAULT_DO_IOS = TRUE
echo "Do you wish to ${C1}IOSERVER${CN}? (Default: ${C2}YES${CN} or ${C2}TRUE${CN})"
# MVAPICH2 requires ioserver for history (issue with MPI_Put and MAPL)
else if( $MPI == mvapich2 ) then
set DEFAULT_DO_IOS = TRUE
echo "Do you wish to ${C1}IOSERVER${CN}? (Default: ${C2}YES${CN} or ${C2}TRUE${CN})"
else
Expand Down Expand Up @@ -431,29 +427,50 @@ endif
ASKPROC:

if ( $SITE == 'NCCS' ) then
echo "Enter the ${C1}Processor Type${CN} you wish to run on:"
echo " ${C2}sky (Skylake)${CN} (default)"
echo " ${C2}cas (Cascade Lake)${CN}"
echo " "
set MODEL = `echo $<`
set MODEL = `echo $MODEL | tr "[:upper:]" "[:lower:]"`
if ( .$MODEL == .) then
set MODEL = 'sky'
endif

if( $MODEL != 'sky' & \
$MODEL != 'cas' ) goto ASKPROC
set BUILT_ON_SLES15 = @BUILT_ON_SLES15@

if ($MODEL == 'sky') then
set NCPUS_PER_NODE = 40
else if ($MODEL == 'cas') then
# NCCS currently recommends that users do not run with
# 48 cores per node on SCU16 due to OS issues and
# recommends that CPU-intensive work run with 46 or fewer
# cores. As 45 is a multiple of 3, it's the best value
# that doesn't waste too many cores
#set NCPUS_PER_NODE = 48
set NCPUS_PER_NODE = 45
if ("$BUILT_ON_SLES15" == "TRUE") then
echo "Enter the ${C1}Processor Type${CN} you wish to run on:"
echo " ${C2}mil (Milan)${CN} (default)"
echo " "
set MODEL = `echo $<`
set MODEL = `echo $MODEL | tr "[:upper:]" "[:lower:]"`
if ( .$MODEL == .) then
set MODEL = 'mil'
endif

if( $MODEL != 'mil' ) goto ASKPROC

if ($MODEL == 'mil') then
# We save a couple processes for the kernel
set NCPUS_PER_NODE = 126
endif
else
echo "Enter the ${C1}Processor Type${CN} you wish to run on:"
echo " ${C2}sky (Skylake)${CN} (default)"
echo " ${C2}cas (Cascade Lake)${CN}"
echo " "
set MODEL = `echo $<`
set MODEL = `echo $MODEL | tr "[:upper:]" "[:lower:]"`
if ( .$MODEL == .) then
set MODEL = 'sky'
endif

if( $MODEL != 'sky' & \
$MODEL != 'cas' ) goto ASKPROC

if ($MODEL == 'sky') then
set NCPUS_PER_NODE = 40
else if ($MODEL == 'cas') then
# NCCS currently recommends that users do not run with
# 48 cores per node on SCU16 due to OS issues and
# recommends that CPU-intensive work run with 46 or fewer
# cores. As 45 is a multiple of 3, it's the best value
# that doesn't waste too many cores
#set NCPUS_PER_NODE = 48
set NCPUS_PER_NODE = 45
endif
endif

else if ( $SITE == 'NAS' ) then
Expand Down Expand Up @@ -1681,6 +1698,15 @@ if ( $DO_IOS == TRUE ) then
# multigroup requires at least two backend pes
if ($NUM_BACKEND_PES < 2) set NUM_BACKEND_PES = 2

# Next calculate the number of frontend PEs
@ NUM_FRONTEND_PES=$NCPUS_PER_NODE - $NUM_BACKEND_PES

# If the model's PE count is less than the number of frontend PEs, reduce the frontend PEs and increase the backend PEs to match
if ($MODEL_NPES < $NUM_FRONTEND_PES) then
@ NUM_FRONTEND_PES=$MODEL_NPES - 2
@ NUM_BACKEND_PES=$NCPUS_PER_NODE - $NUM_FRONTEND_PES
endif

# Calculate the total number of nodes to request from batch
@ NODES=$NUM_MODEL_NODES + $NUM_OSERVER_NODES

Expand Down Expand Up @@ -2138,9 +2164,13 @@ cat > $HOMDIR/SETENV.commands << EOF
EOF

# The below settings seem to be recommended for hybrid
# systems using MVAPICH2 but could change
# systems using MVAPICH but could change

else if( $MPI == mvapich ) then

else if( $MPI == mvapich2 ) then
# MVAPICH and GEOS have issues with restart writing. Having the
# oserver write them seems to work around the problem
set RESTART_BY_OSERVER = YES

cat > $HOMDIR/SETENV.commands << EOF
setenv MV2_ENABLE_AFFINITY 0
Expand Down Expand Up @@ -2216,6 +2246,17 @@ EOF

endif # if NOT Singularity

# Testing on SLES15 showed that the mlx provider did not seem
# to work at scale, so we use the verbs provider instead. Note:
# this still seems to have issues at c720
if ("$BUILT_ON_SLES15" == "TRUE") then
cat >> $HOMDIR/SETENV.commands << EOF
setenv I_MPI_OFI_PROVIDER verbs
setenv I_MPI_COLL_EXTERNAL 0
EOF

endif # if SLES15

endif # if NCCS

endif # if mpi
Expand Down
103 changes: 72 additions & 31 deletions geoschemchem_setup
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ endif
# Build Directory Locations
#######################################################################

# Set Current Working Path to geoschemchem_setup
# ----------------------------------------------
# Set Current Working Path to gcm_setup
# -------------------------------------
setenv ARCH `uname -s`
setenv NODE `uname -n`

Expand Down Expand Up @@ -169,8 +169,8 @@ endif

setenv BASEDIR `awk '{print $2}' $ETCDIR/BASEDIR.rc`

if ( `echo $BASEDIR | grep -i mvapich2` != '') then
set MPI = mvapich2
if ( `echo $BASEDIR | grep -i mvapich` != '') then
set MPI = mvapich
else if ( `echo $BASEDIR | grep -i mpich` != '') then
set MPI = mpich
else if ( `echo $BASEDIR | grep -i openmpi` != '') then
Expand Down Expand Up @@ -386,10 +386,6 @@ if( $HRCODE == 'c180' | \
$HRCODE == 'c1536' | \
$HRCODE == 'c2160' ) then

set DEFAULT_DO_IOS = TRUE
echo "Do you wish to ${C1}IOSERVER${CN}? (Default: ${C2}YES${CN} or ${C2}TRUE${CN})"
# MVAPICH2 requires ioserver for history (issue with MPI_Put and MAPL)
else if( $MPI == mvapich2 ) then
set DEFAULT_DO_IOS = TRUE
echo "Do you wish to ${C1}IOSERVER${CN}? (Default: ${C2}YES${CN} or ${C2}TRUE${CN})"
else
Expand Down Expand Up @@ -431,29 +427,50 @@ endif
ASKPROC:

if ( $SITE == 'NCCS' ) then
echo "Enter the ${C1}Processor Type${CN} you wish to run on:"
echo " ${C2}sky (Skylake)${CN} (default)"
echo " ${C2}cas (Cascade Lake)${CN}"
echo " "
set MODEL = `echo $<`
set MODEL = `echo $MODEL | tr "[:upper:]" "[:lower:]"`
if ( .$MODEL == .) then
set MODEL = 'sky'
endif

if( $MODEL != 'sky' & \
$MODEL != 'cas' ) goto ASKPROC
set BUILT_ON_SLES15 = @BUILT_ON_SLES15@

if ($MODEL == 'sky') then
set NCPUS_PER_NODE = 40
else if ($MODEL == 'cas') then
# NCCS currently recommends that users do not run with
# 48 cores per node on SCU16 due to OS issues and
# recommends that CPU-intensive work run with 46 or fewer
# cores. As 45 is a multiple of 3, it's the best value
# that doesn't waste too many cores
#set NCPUS_PER_NODE = 48
set NCPUS_PER_NODE = 45
if ("$BUILT_ON_SLES15" == "TRUE") then
echo "Enter the ${C1}Processor Type${CN} you wish to run on:"
echo " ${C2}mil (Milan)${CN} (default)"
echo " "
set MODEL = `echo $<`
set MODEL = `echo $MODEL | tr "[:upper:]" "[:lower:]"`
if ( .$MODEL == .) then
set MODEL = 'mil'
endif

if( $MODEL != 'mil' ) goto ASKPROC

if ($MODEL == 'mil') then
# We save a couple processes for the kernel
set NCPUS_PER_NODE = 126
endif
else
echo "Enter the ${C1}Processor Type${CN} you wish to run on:"
echo " ${C2}sky (Skylake)${CN} (default)"
echo " ${C2}cas (Cascade Lake)${CN}"
echo " "
set MODEL = `echo $<`
set MODEL = `echo $MODEL | tr "[:upper:]" "[:lower:]"`
if ( .$MODEL == .) then
set MODEL = 'sky'
endif

if( $MODEL != 'sky' & \
$MODEL != 'cas' ) goto ASKPROC

if ($MODEL == 'sky') then
set NCPUS_PER_NODE = 40
else if ($MODEL == 'cas') then
# NCCS currently recommends that users do not run with
# 48 cores per node on SCU16 due to OS issues and
# recommends that CPU-intensive work run with 46 or fewer
# cores. As 45 is a multiple of 3, it's the best value
# that doesn't waste too many cores
#set NCPUS_PER_NODE = 48
set NCPUS_PER_NODE = 45
endif
endif

else if ( $SITE == 'NAS' ) then
Expand Down Expand Up @@ -1711,6 +1728,15 @@ if ( $DO_IOS == TRUE ) then
# multigroup requires at least two backend pes
if ($NUM_BACKEND_PES < 2) set NUM_BACKEND_PES = 2

# Next calculate the number of frontend PEs
@ NUM_FRONTEND_PES=$NCPUS_PER_NODE - $NUM_BACKEND_PES

# If the model's PE count is less than the number of frontend PEs, reduce the frontend PEs and increase the backend PEs to match
if ($MODEL_NPES < $NUM_FRONTEND_PES) then
@ NUM_FRONTEND_PES=$MODEL_NPES - 2
@ NUM_BACKEND_PES=$NCPUS_PER_NODE - $NUM_FRONTEND_PES
endif

# Calculate the total number of nodes to request from batch
@ NODES=$NUM_MODEL_NODES + $NUM_OSERVER_NODES

Expand Down Expand Up @@ -2168,9 +2194,13 @@ cat > $HOMDIR/SETENV.commands << EOF
EOF

# The below settings seem to be recommended for hybrid
# systems using MVAPICH2 but could change
# systems using MVAPICH but could change

else if( $MPI == mvapich ) then

else if( $MPI == mvapich2 ) then
# MVAPICH and GEOS have issues with restart writing. Having the
# oserver write them seems to work around the problem
set RESTART_BY_OSERVER = YES

cat > $HOMDIR/SETENV.commands << EOF
setenv MV2_ENABLE_AFFINITY 0
Expand Down Expand Up @@ -2246,6 +2276,17 @@ EOF

endif # if NOT Singularity

# Testing on SLES15 showed that the mlx provider did not seem
# to work at scale, so we use the verbs provider instead. Note:
# this still seems to have issues at c720
if ("$BUILT_ON_SLES15" == "TRUE") then
cat >> $HOMDIR/SETENV.commands << EOF
setenv I_MPI_OFI_PROVIDER verbs
setenv I_MPI_COLL_EXTERNAL 0
EOF

endif # if SLES15

endif # if NCCS

endif # if mpi
Expand Down
Loading

0 comments on commit 826aa0c

Please sign in to comment.