diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/start_slurm.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/start_slurm.sh
index b41b09ba..c8fd2b9e 100644
--- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/start_slurm.sh
+++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/start_slurm.sh
@@ -13,20 +13,26 @@ main() {
   echo "[INFO] START: Starting Slurm daemons"
 
   if [[ $1 == "controller" ]]; then
-    echo "[INFO] This is a Controller node. Start slurm controller daemon..."
+    echo "[INFO] This is a Controller node. Start slurm controller daemon..."
 
-    systemctl enable --now slurmctld
-  fi
+    systemctl enable --now slurmctld
 
-  if [[ $1 == "compute" ]] || [[ $1 == "login" ]]; then
+    # Rename the slurmd unit file so it is no longer present as slurmd.service
+    # on the controller. A failed rename is reported but is not fatal.
+    mv /etc/systemd/system/slurmd{,_DO_NOT_START_ON_CONTROLLER}.service \
+      || echo "Failed to mask slurmd, perhaps the AMI already masked it?"
+  elif [[ $1 == "compute" ]] || [[ $1 == "login" ]]; then
     echo "[INFO] Running on $1 node. Start slurm daemon..."
 
+    # Login nodes must still restart slurmd to fetch slurm.conf to /var/spool/slurmd/, however
+    # slurmd won't run because slurm.conf does not contain login nodes.
     SLURMD_OPTIONS="--conf-server $CONTROLLER_IP_VALUES" envsubst < /etc/systemd/system/slurmd.service > slurmd.service
     mv slurmd.service /etc/systemd/system/
 
-    systemctl daemon-reload
-    systemctl enable slurmd
-    systemctl restart slurmd
+    systemctl enable --now slurmd
+
+    # Rename the slurmctld unit file so it is no longer present as
+    # slurmctld.service on compute/login nodes. A failed rename is reported but is not fatal.
+    mv /etc/systemd/system/slurmctld{,_DO_NOT_START_ON_COMPUTE_OR_LOGIN}.service \
+      || echo "Failed to mask slurmctld, perhaps the AMI already masked it?"
   fi
 
   echo "[INFO] Start Slurm Script completed"