diff --git a/gcm_setup b/gcm_setup index 2dad7352..4573188f 100755 --- a/gcm_setup +++ b/gcm_setup @@ -169,8 +169,8 @@ endif setenv BASEDIR `awk '{print $2}' $ETCDIR/BASEDIR.rc` - if ( `echo $BASEDIR | grep -i mvapich2` != '') then - set MPI = mvapich2 + if ( `echo $BASEDIR | grep -i mvapich` != '') then + set MPI = mvapich else if ( `echo $BASEDIR | grep -i mpich` != '') then set MPI = mpich else if ( `echo $BASEDIR | grep -i openmpi` != '') then @@ -386,10 +386,6 @@ if( $HRCODE == 'c180' | \ $HRCODE == 'c1536' | \ $HRCODE == 'c2160' ) then - set DEFAULT_DO_IOS = TRUE - echo "Do you wish to ${C1}IOSERVER${CN}? (Default: ${C2}YES${CN} or ${C2}TRUE${CN})" -# MVAPICH2 requires ioserver for history (issue with MPI_Put and MAPL) -else if( $MPI == mvapich2 ) then set DEFAULT_DO_IOS = TRUE echo "Do you wish to ${C1}IOSERVER${CN}? (Default: ${C2}YES${CN} or ${C2}TRUE${CN})" else @@ -431,29 +427,50 @@ endif ASKPROC: if ( $SITE == 'NCCS' ) then - echo "Enter the ${C1}Processor Type${CN} you wish to run on:" - echo " ${C2}sky (Skylake)${CN} (default)" - echo " ${C2}cas (Cascade Lake)${CN}" - echo " " - set MODEL = `echo $<` - set MODEL = `echo $MODEL | tr "[:upper:]" "[:lower:]"` - if ( .$MODEL == .) then - set MODEL = 'sky' - endif - if( $MODEL != 'sky' & \ - $MODEL != 'cas' ) goto ASKPROC + set BUILT_ON_SLES15 = @BUILT_ON_SLES15@ - if ($MODEL == 'sky') then - set NCPUS_PER_NODE = 40 - else if ($MODEL == 'cas') then - # NCCS currently recommends that users do not run with - # 48 cores per node on SCU16 due to OS issues and - # recommends that CPU-intensive works run with 46 or less - # cores. 
As 45 is a multiple of 3, it's the best value - # that doesn't waste too much - #set NCPUS_PER_NODE = 48 - set NCPUS_PER_NODE = 45 + if ("$BUILT_ON_SLES15" == "TRUE") then + echo "Enter the ${C1}Processor Type${CN} you wish to run on:" + echo " ${C2}mil (Milan)${CN} (default)" + echo " " + set MODEL = `echo $<` + set MODEL = `echo $MODEL | tr "[:upper:]" "[:lower:]"` + if ( .$MODEL == .) then + set MODEL = 'mil' + endif + + if( $MODEL != 'mil' ) goto ASKPROC + + if ($MODEL == 'mil') then + # We save a couple processes for the kernel + set NCPUS_PER_NODE = 126 + endif + else + echo "Enter the ${C1}Processor Type${CN} you wish to run on:" + echo " ${C2}sky (Skylake)${CN} (default)" + echo " ${C2}cas (Cascade Lake)${CN}" + echo " " + set MODEL = `echo $<` + set MODEL = `echo $MODEL | tr "[:upper:]" "[:lower:]"` + if ( .$MODEL == .) then + set MODEL = 'sky' + endif + + if( $MODEL != 'sky' & \ + $MODEL != 'cas' ) goto ASKPROC + + if ($MODEL == 'sky') then + set NCPUS_PER_NODE = 40 + else if ($MODEL == 'cas') then + # NCCS currently recommends that users do not run with + # 48 cores per node on SCU16 due to OS issues and + # recommends that CPU-intensive works run with 46 or less + # cores. 
As 45 is a multiple of 3, it's the best value + # that doesn't waste too much + #set NCPUS_PER_NODE = 48 + set NCPUS_PER_NODE = 45 + endif endif else if ( $SITE == 'NAS' ) then @@ -1681,6 +1698,15 @@ if ( $DO_IOS == TRUE ) then # multigroup requires at least two backend pes if ($NUM_BACKEND_PES < 2) set NUM_BACKEND_PES = 2 + # Next calculate the number of frontend PEs + @ NUM_FRONTEND_PES=$NCPUS_PER_NODE - $NUM_BACKEND_PES + + # If the model's PE count is less than the frontend PE count, then we need to reduce frontend by increasing backend + if ($MODEL_NPES < $NUM_FRONTEND_PES) then + @ NUM_FRONTEND_PES=$MODEL_NPES - 2 + @ NUM_BACKEND_PES=$NCPUS_PER_NODE - $NUM_FRONTEND_PES + endif + # Calculate the total number of nodes to request from batch @ NODES=$NUM_MODEL_NODES + $NUM_OSERVER_NODES @@ -2138,9 +2164,13 @@ cat > $HOMDIR/SETENV.commands << EOF EOF # The below settings seem to be recommended for hybrid -# systems using MVAPICH2 but could change +# systems using MVAPICH but could change + +else if( $MPI == mvapich ) then -else if( $MPI == mvapich2 ) then +# MVAPICH and GEOS have issues with restart writing. Having the +# oserver write them seems to work +set RESTART_BY_OSERVER = YES cat > $HOMDIR/SETENV.commands << EOF setenv MV2_ENABLE_AFFINITY 0 @@ -2216,6 +2246,17 @@ EOF endif # if NOT Singularity +# Testing on SLES15 showed that the mlx provider did not seem +# to work at scale. So we move to use the verbs provider. 
Note: +# still seems to have issues at c720 +if ("$BUILT_ON_SLES15" == "TRUE") then +cat >> $HOMDIR/SETENV.commands << EOF +setenv I_MPI_OFI_PROVIDER verbs +setenv I_MPI_COLL_EXTERNAL 0 +EOF + +endif # if SLES15 + endif # if NCCS endif # if mpi diff --git a/geoschemchem_setup b/geoschemchem_setup index f1588847..575f8ff6 100755 --- a/geoschemchem_setup +++ b/geoschemchem_setup @@ -31,8 +31,8 @@ endif # Build Directory Locations ####################################################################### -# Set Current Working Path to geoschemchem_setup -# ---------------------------------------------- +# Set Current Working Path to geoschemchem_setup +# ---------------------------------------------- setenv ARCH `uname -s` setenv NODE `uname -n` @@ -169,8 +169,8 @@ endif setenv BASEDIR `awk '{print $2}' $ETCDIR/BASEDIR.rc` - if ( `echo $BASEDIR | grep -i mvapich2` != '') then - set MPI = mvapich2 + if ( `echo $BASEDIR | grep -i mvapich` != '') then + set MPI = mvapich else if ( `echo $BASEDIR | grep -i mpich` != '') then set MPI = mpich else if ( `echo $BASEDIR | grep -i openmpi` != '') then @@ -386,10 +386,6 @@ if( $HRCODE == 'c180' | \ $HRCODE == 'c1536' | \ $HRCODE == 'c2160' ) then - set DEFAULT_DO_IOS = TRUE - echo "Do you wish to ${C1}IOSERVER${CN}? (Default: ${C2}YES${CN} or ${C2}TRUE${CN})" -# MVAPICH2 requires ioserver for history (issue with MPI_Put and MAPL) -else if( $MPI == mvapich2 ) then set DEFAULT_DO_IOS = TRUE echo "Do you wish to ${C1}IOSERVER${CN}? (Default: ${C2}YES${CN} or ${C2}TRUE${CN})" else @@ -431,29 +427,50 @@ endif ASKPROC: if ( $SITE == 'NCCS' ) then - echo "Enter the ${C1}Processor Type${CN} you wish to run on:" - echo " ${C2}sky (Skylake)${CN} (default)" - echo " ${C2}cas (Cascade Lake)${CN}" - echo " " - set MODEL = `echo $<` - set MODEL = `echo $MODEL | tr "[:upper:]" "[:lower:]"` - if ( .$MODEL == .) 
then - set MODEL = 'sky' - endif - if( $MODEL != 'sky' & \ - $MODEL != 'cas' ) goto ASKPROC + set BUILT_ON_SLES15 = @BUILT_ON_SLES15@ - if ($MODEL == 'sky') then - set NCPUS_PER_NODE = 40 - else if ($MODEL == 'cas') then - # NCCS currently recommends that users do not run with - # 48 cores per node on SCU16 due to OS issues and - # recommends that CPU-intensive works run with 46 or less - # cores. As 45 is a multiple of 3, it's the best value - # that doesn't waste too much - #set NCPUS_PER_NODE = 48 - set NCPUS_PER_NODE = 45 + if ("$BUILT_ON_SLES15" == "TRUE") then + echo "Enter the ${C1}Processor Type${CN} you wish to run on:" + echo " ${C2}mil (Milan)${CN} (default)" + echo " " + set MODEL = `echo $<` + set MODEL = `echo $MODEL | tr "[:upper:]" "[:lower:]"` + if ( .$MODEL == .) then + set MODEL = 'mil' + endif + + if( $MODEL != 'mil' ) goto ASKPROC + + if ($MODEL == 'mil') then + # We save a couple processes for the kernel + set NCPUS_PER_NODE = 126 + endif + else + echo "Enter the ${C1}Processor Type${CN} you wish to run on:" + echo " ${C2}sky (Skylake)${CN} (default)" + echo " ${C2}cas (Cascade Lake)${CN}" + echo " " + set MODEL = `echo $<` + set MODEL = `echo $MODEL | tr "[:upper:]" "[:lower:]"` + if ( .$MODEL == .) then + set MODEL = 'sky' + endif + + if( $MODEL != 'sky' & \ + $MODEL != 'cas' ) goto ASKPROC + + if ($MODEL == 'sky') then + set NCPUS_PER_NODE = 40 + else if ($MODEL == 'cas') then + # NCCS currently recommends that users do not run with + # 48 cores per node on SCU16 due to OS issues and + # recommends that CPU-intensive works run with 46 or less + # cores. 
As 45 is a multiple of 3, it's the best value + # that doesn't waste too much + #set NCPUS_PER_NODE = 48 + set NCPUS_PER_NODE = 45 + endif endif else if ( $SITE == 'NAS' ) then @@ -1711,6 +1728,15 @@ if ( $DO_IOS == TRUE ) then # multigroup requires at least two backend pes if ($NUM_BACKEND_PES < 2) set NUM_BACKEND_PES = 2 + # Next calculate the number of frontend PEs + @ NUM_FRONTEND_PES=$NCPUS_PER_NODE - $NUM_BACKEND_PES + + # If the model's PE count is less than the frontend PE count, then we need to reduce frontend by increasing backend + if ($MODEL_NPES < $NUM_FRONTEND_PES) then + @ NUM_FRONTEND_PES=$MODEL_NPES - 2 + @ NUM_BACKEND_PES=$NCPUS_PER_NODE - $NUM_FRONTEND_PES + endif + # Calculate the total number of nodes to request from batch @ NODES=$NUM_MODEL_NODES + $NUM_OSERVER_NODES @@ -2168,9 +2194,13 @@ cat > $HOMDIR/SETENV.commands << EOF EOF # The below settings seem to be recommended for hybrid -# systems using MVAPICH2 but could change +# systems using MVAPICH but could change + +else if( $MPI == mvapich ) then -else if( $MPI == mvapich2 ) then +# MVAPICH and GEOS have issues with restart writing. Having the +# oserver write them seems to work +set RESTART_BY_OSERVER = YES cat > $HOMDIR/SETENV.commands << EOF setenv MV2_ENABLE_AFFINITY 0 @@ -2246,6 +2276,17 @@ EOF endif # if NOT Singularity +# Testing on SLES15 showed that the mlx provider did not seem +# to work at scale. So we move to use the verbs provider. 
Note: +# still seems to have issues at c720 +if ("$BUILT_ON_SLES15" == "TRUE") then +cat >> $HOMDIR/SETENV.commands << EOF +setenv I_MPI_OFI_PROVIDER verbs +setenv I_MPI_COLL_EXTERNAL 0 +EOF + +endif # if SLES15 + endif # if NCCS endif # if mpi diff --git a/gmichem_setup b/gmichem_setup index 884514c6..72c25fa1 100755 --- a/gmichem_setup +++ b/gmichem_setup @@ -169,8 +169,8 @@ endif setenv BASEDIR `awk '{print $2}' $ETCDIR/BASEDIR.rc` - if ( `echo $BASEDIR | grep -i mvapich2` != '') then - set MPI = mvapich2 + if ( `echo $BASEDIR | grep -i mvapich` != '') then + set MPI = mvapich else if ( `echo $BASEDIR | grep -i mpich` != '') then set MPI = mpich else if ( `echo $BASEDIR | grep -i openmpi` != '') then @@ -386,10 +386,6 @@ if( $HRCODE == 'c180' | \ $HRCODE == 'c1536' | \ $HRCODE == 'c2160' ) then - set DEFAULT_DO_IOS = TRUE - echo "Do you wish to ${C1}IOSERVER${CN}? (Default: ${C2}YES${CN} or ${C2}TRUE${CN})" -# MVAPICH2 requires ioserver for history (issue with MPI_Put and MAPL) -else if( $MPI == mvapich2 ) then set DEFAULT_DO_IOS = TRUE echo "Do you wish to ${C1}IOSERVER${CN}? (Default: ${C2}YES${CN} or ${C2}TRUE${CN})" else @@ -431,29 +427,50 @@ endif ASKPROC: if ( $SITE == 'NCCS' ) then - echo "Enter the ${C1}Processor Type${CN} you wish to run on:" - echo " ${C2}sky (Skylake)${CN} (default)" - echo " ${C2}cas (Cascade Lake)${CN}" - echo " " - set MODEL = `echo $<` - set MODEL = `echo $MODEL | tr "[:upper:]" "[:lower:]"` - if ( .$MODEL == .) then - set MODEL = 'sky' - endif - if( $MODEL != 'sky' & \ - $MODEL != 'cas' ) goto ASKPROC + set BUILT_ON_SLES15 = @BUILT_ON_SLES15@ - if ($MODEL == 'sky') then - set NCPUS_PER_NODE = 40 - else if ($MODEL == 'cas') then - # NCCS currently recommends that users do not run with - # 48 cores per node on SCU16 due to OS issues and - # recommends that CPU-intensive works run with 46 or less - # cores. 
As 45 is a multiple of 3, it's the best value - # that doesn't waste too much - #set NCPUS_PER_NODE = 48 - set NCPUS_PER_NODE = 45 + if ("$BUILT_ON_SLES15" == "TRUE") then + echo "Enter the ${C1}Processor Type${CN} you wish to run on:" + echo " ${C2}mil (Milan)${CN} (default)" + echo " " + set MODEL = `echo $<` + set MODEL = `echo $MODEL | tr "[:upper:]" "[:lower:]"` + if ( .$MODEL == .) then + set MODEL = 'mil' + endif + + if( $MODEL != 'mil' ) goto ASKPROC + + if ($MODEL == 'mil') then + # We save a couple processes for the kernel + set NCPUS_PER_NODE = 126 + endif + else + echo "Enter the ${C1}Processor Type${CN} you wish to run on:" + echo " ${C2}sky (Skylake)${CN} (default)" + echo " ${C2}cas (Cascade Lake)${CN}" + echo " " + set MODEL = `echo $<` + set MODEL = `echo $MODEL | tr "[:upper:]" "[:lower:]"` + if ( .$MODEL == .) then + set MODEL = 'sky' + endif + + if( $MODEL != 'sky' & \ + $MODEL != 'cas' ) goto ASKPROC + + if ($MODEL == 'sky') then + set NCPUS_PER_NODE = 40 + else if ($MODEL == 'cas') then + # NCCS currently recommends that users do not run with + # 48 cores per node on SCU16 due to OS issues and + # recommends that CPU-intensive works run with 46 or less + # cores. 
As 45 is a multiple of 3, it's the best value + # that doesn't waste too much + #set NCPUS_PER_NODE = 48 + set NCPUS_PER_NODE = 45 + endif endif else if ( $SITE == 'NAS' ) then @@ -1872,6 +1889,15 @@ if ( $DO_IOS == TRUE ) then # multigroup requires at least two backend pes if ($NUM_BACKEND_PES < 2) set NUM_BACKEND_PES = 2 + # Next calculate the number of frontend PEs + @ NUM_FRONTEND_PES=$NCPUS_PER_NODE - $NUM_BACKEND_PES + + # If the model's PE count is less than the frontend PE count, then we need to reduce frontend by increasing backend + if ($MODEL_NPES < $NUM_FRONTEND_PES) then + @ NUM_FRONTEND_PES=$MODEL_NPES - 2 + @ NUM_BACKEND_PES=$NCPUS_PER_NODE - $NUM_FRONTEND_PES + endif + # Calculate the total number of nodes to request from batch @ NODES=$NUM_MODEL_NODES + $NUM_OSERVER_NODES @@ -2335,9 +2361,13 @@ cat > $HOMDIR/SETENV.commands << EOF EOF # The below settings seem to be recommended for hybrid -# systems using MVAPICH2 but could change +# systems using MVAPICH but could change + +else if( $MPI == mvapich ) then -else if( $MPI == mvapich2 ) then +# MVAPICH and GEOS have issues with restart writing. Having the +# oserver write them seems to work +set RESTART_BY_OSERVER = YES cat > $HOMDIR/SETENV.commands << EOF setenv MV2_ENABLE_AFFINITY 0 @@ -2413,6 +2443,17 @@ EOF endif # if NOT Singularity +# Testing on SLES15 showed that the mlx provider did not seem +# to work at scale. So we move to use the verbs provider. 
Note: +# still seems to have issues at c720 +if ("$BUILT_ON_SLES15" == "TRUE") then +cat >> $HOMDIR/SETENV.commands << EOF +setenv I_MPI_OFI_PROVIDER verbs +setenv I_MPI_COLL_EXTERNAL 0 +EOF + +endif # if SLES15 + endif # if NCCS endif # if mpi diff --git a/stratchem_setup b/stratchem_setup index f0ca3ac4..f1db4df1 100755 --- a/stratchem_setup +++ b/stratchem_setup @@ -169,8 +169,8 @@ endif setenv BASEDIR `awk '{print $2}' $ETCDIR/BASEDIR.rc` - if ( `echo $BASEDIR | grep -i mvapich2` != '') then - set MPI = mvapich2 + if ( `echo $BASEDIR | grep -i mvapich` != '') then + set MPI = mvapich else if ( `echo $BASEDIR | grep -i mpich` != '') then set MPI = mpich else if ( `echo $BASEDIR | grep -i openmpi` != '') then @@ -386,10 +386,6 @@ if( $HRCODE == 'c180' | \ $HRCODE == 'c1536' | \ $HRCODE == 'c2160' ) then - set DEFAULT_DO_IOS = TRUE - echo "Do you wish to ${C1}IOSERVER${CN}? (Default: ${C2}YES${CN} or ${C2}TRUE${CN})" -# MVAPICH2 requires ioserver for history (issue with MPI_Put and MAPL) -else if( $MPI == mvapich2 ) then set DEFAULT_DO_IOS = TRUE echo "Do you wish to ${C1}IOSERVER${CN}? (Default: ${C2}YES${CN} or ${C2}TRUE${CN})" else @@ -431,29 +427,50 @@ endif ASKPROC: if ( $SITE == 'NCCS' ) then - echo "Enter the ${C1}Processor Type${CN} you wish to run on:" - echo " ${C2}sky (Skylake)${CN} (default)" - echo " ${C2}cas (Cascade Lake)${CN}" - echo " " - set MODEL = `echo $<` - set MODEL = `echo $MODEL | tr "[:upper:]" "[:lower:]"` - if ( .$MODEL == .) then - set MODEL = 'sky' - endif - if( $MODEL != 'sky' & \ - $MODEL != 'cas' ) goto ASKPROC + set BUILT_ON_SLES15 = @BUILT_ON_SLES15@ - if ($MODEL == 'sky') then - set NCPUS_PER_NODE = 40 - else if ($MODEL == 'cas') then - # NCCS currently recommends that users do not run with - # 48 cores per node on SCU16 due to OS issues and - # recommends that CPU-intensive works run with 46 or less - # cores. 
As 45 is a multiple of 3, it's the best value - # that doesn't waste too much - #set NCPUS_PER_NODE = 48 - set NCPUS_PER_NODE = 45 + if ("$BUILT_ON_SLES15" == "TRUE") then + echo "Enter the ${C1}Processor Type${CN} you wish to run on:" + echo " ${C2}mil (Milan)${CN} (default)" + echo " " + set MODEL = `echo $<` + set MODEL = `echo $MODEL | tr "[:upper:]" "[:lower:]"` + if ( .$MODEL == .) then + set MODEL = 'mil' + endif + + if( $MODEL != 'mil' ) goto ASKPROC + + if ($MODEL == 'mil') then + # We save a couple processes for the kernel + set NCPUS_PER_NODE = 126 + endif + else + echo "Enter the ${C1}Processor Type${CN} you wish to run on:" + echo " ${C2}sky (Skylake)${CN} (default)" + echo " ${C2}cas (Cascade Lake)${CN}" + echo " " + set MODEL = `echo $<` + set MODEL = `echo $MODEL | tr "[:upper:]" "[:lower:]"` + if ( .$MODEL == .) then + set MODEL = 'sky' + endif + + if( $MODEL != 'sky' & \ + $MODEL != 'cas' ) goto ASKPROC + + if ($MODEL == 'sky') then + set NCPUS_PER_NODE = 40 + else if ($MODEL == 'cas') then + # NCCS currently recommends that users do not run with + # 48 cores per node on SCU16 due to OS issues and + # recommends that CPU-intensive works run with 46 or less + # cores. 
As 45 is a multiple of 3, it's the best value + # that doesn't waste too much + #set NCPUS_PER_NODE = 48 + set NCPUS_PER_NODE = 45 + endif endif else if ( $SITE == 'NAS' ) then @@ -1696,6 +1713,15 @@ if ( $DO_IOS == TRUE ) then # multigroup requires at least two backend pes if ($NUM_BACKEND_PES < 2) set NUM_BACKEND_PES = 2 + # Next calculate the number of frontend PEs + @ NUM_FRONTEND_PES=$NCPUS_PER_NODE - $NUM_BACKEND_PES + + # If the model's PE count is less than the frontend PE count, then we need to reduce frontend by increasing backend + if ($MODEL_NPES < $NUM_FRONTEND_PES) then + @ NUM_FRONTEND_PES=$MODEL_NPES - 2 + @ NUM_BACKEND_PES=$NCPUS_PER_NODE - $NUM_FRONTEND_PES + endif + # Calculate the total number of nodes to request from batch @ NODES=$NUM_MODEL_NODES + $NUM_OSERVER_NODES @@ -2153,9 +2179,13 @@ cat > $HOMDIR/SETENV.commands << EOF EOF # The below settings seem to be recommended for hybrid -# systems using MVAPICH2 but could change +# systems using MVAPICH but could change + +else if( $MPI == mvapich ) then -else if( $MPI == mvapich2 ) then +# MVAPICH and GEOS have issues with restart writing. Having the +# oserver write them seems to work +set RESTART_BY_OSERVER = YES cat > $HOMDIR/SETENV.commands << EOF setenv MV2_ENABLE_AFFINITY 0 @@ -2231,6 +2261,17 @@ EOF endif # if NOT Singularity +# Testing on SLES15 showed that the mlx provider did not seem +# to work at scale. So we move to use the verbs provider. Note: +# still seems to have issues at c720 +if ("$BUILT_ON_SLES15" == "TRUE") then +cat >> $HOMDIR/SETENV.commands << EOF +setenv I_MPI_OFI_PROVIDER verbs +setenv I_MPI_COLL_EXTERNAL 0 +EOF + +endif # if SLES15 + endif # if NCCS endif # if mpi