From 72274f216a82e581ba7a9c08bc3fa902e969ad72 Mon Sep 17 00:00:00 2001
From: Robert Bartel
Date: Wed, 26 Jul 2023 09:44:51 -0400
Subject: [PATCH] Update entrypoint for partition config checking.

Updating once more to make sure logic for checking for parallel
processing is correct where needed, and then simplifying subsequent
logic that can rely on results of previous check.
---
Review note: the new "invalid CPU count parsing" error message is
redirected with '>&2' (rather than '2>&1') so that it is actually
written to stderr instead of stdout.

 docker/main/ngen/entrypoint.sh | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/docker/main/ngen/entrypoint.sh b/docker/main/ngen/entrypoint.sh
index bfbac7196..c16449f01 100755
--- a/docker/main/ngen/entrypoint.sh
+++ b/docker/main/ngen/entrypoint.sh
@@ -49,9 +49,15 @@ OUTPUT_DATASET_DIR="${ALL_DATASET_DIR}/output/${OUTPUT_DATASET_NAME}"
 HYDROFABRIC_DATASET_DIR="${ALL_DATASET_DIR}/hydrofabric/${HYDROFABRIC_DATASET_NAME}"
 REALIZATION_CONFIG_DATASET_DIR="${ALL_DATASET_DIR}/config/${REALIZATION_CONFIG_DATASET_NAME}"
 BMI_CONFIG_DATASET_DIR="${ALL_DATASET_DIR}/config/${BMI_CONFIG_DATASET_NAME}"
-# Don't require a partitioning dataset when only using a single node
-if [ ${MPI_NODE_COUNT:?} -gt 1 ]; then
+# Check if parallel processing is in effect and partition dataset is needed by testing node count or 1st node CPU count
+if [ ${MPI_NODE_COUNT:?} -gt 1 ] || [ $(echo "${MPI_HOST_STRING}" | sed 's/,//' | awk -F: '{print $2}') -gt 1 ] 2>/dev/null; then
     PARTITION_DATASET_DIR="${ALL_DATASET_DIR}/config/${PARTITION_DATASET_NAME:?No partition config dataset name for directory}"
+# Note that, if the above test is "false" (in particular, the CPU count check) we should ensure the host string is valid
+# Catch false negative due to invalid CPU count/format by taking complement of whether 1st CPU count is greater than -1
+# Any bogus value will result in the pre-complemented test being "false"
+elif ! [ $(echo "${MPI_HOST_STRING}" | sed 's/,//' | awk -F: '{print $2}') -gt -1 ] 2>/dev/null ; then
+    echo "Error: invalid CPU count parsing for first host of MPI host string '${MPI_HOST_STRING}'" >&2
+    exit 1
 fi
 
 RUN_SENTINEL="/home/${MPI_USER}/.run_sentinel"
@@ -168,17 +174,8 @@ exec_serial_ngen_run()
 # Sanity check that the output, hydrofabric, and config datasets are available (i.e., their directories are in place)
 check_for_dataset_dir "${REALIZATION_CONFIG_DATASET_DIR}"
 check_for_dataset_dir "${BMI_CONFIG_DATASET_DIR}"
-# Require a partitioning dataset only when doing parallel, multiprocess ngen job with MPI
-# Most obvious indicator of parallel processing (and simplest to check) is if there are multiple host nodes
-if [ ${MPI_NODE_COUNT:?} -gt 1 ]; then
-    check_for_dataset_dir "${PARTITION_DATASET_DIR:?No partition dataset directory defined}"
-# If using only a single node, then we must check how many CPUs the host has
-elif [ $(echo "${MPI_HOST_STRING}" | sed 's/,//' | awk -F: '{print $2}') -gt 1 ] 2>/dev/null ; then
-    check_for_dataset_dir "${PARTITION_DATASET_DIR:?No partition dataset directory defined}"
-# Also sanity check the host string format and CPU count extraction, ensuring we didn't produce a false negative above
-elif [ $(echo "${MPI_HOST_STRING}" | sed 's/,//' | awk -F: '{print $2}') -ne 1 ] 2>/dev/null ; then
-    echo "Error: failed to extract integer CPU count for first host of MPI host string '${MPI_HOST_STRING}'" 2>&1
-    exit 1
+if [ -n "${PARTITION_DATASET_DIR:-}" ]; then
+    check_for_dataset_dir "${PARTITION_DATASET_DIR}"
 fi
 check_for_dataset_dir "${HYDROFABRIC_DATASET_DIR}"
 check_for_dataset_dir "${OUTPUT_DATASET_DIR}"