diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/lifecycle_script.py b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/lifecycle_script.py index 8087af83..624ffd0f 100644 --- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/lifecycle_script.py +++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/lifecycle_script.py @@ -150,7 +150,7 @@ def main(args): ExecuteBashScript("./setup_mariadb_accounting.sh").run() ExecuteBashScript("./utils/motd.sh").run(node_type) - + ExecuteBashScript("./utils/setup_timesync.sh").run() ExecuteBashScript("./utils/fsx_ubuntu.sh").run() ExecuteBashScript("./start_slurm.sh").run(node_type, ",".join(controllers)) @@ -158,13 +158,13 @@ def main(args): ## Note: Uncomment the below lines to install docker and enroot. # ExecuteBashScript("./utils/install_docker.sh").run() # ExecuteBashScript("./utils/install_enroot_pyxis.sh").run(node_type) - + # # Note: Uncomment the below lines to install DCGM Exporter and EFA Node Exporter and Cluster Nodes. (Docker must also be installed above) # if node_type == SlurmNodeType.COMPUTE_NODE: # ExecuteBashScript("./utils/install_dcgm_exporter.sh").run() # ExecuteBashScript("./utils/install_efa_node_exporter.sh").run() - # # Note: Uncomment the below lines to install Slurm Exporter and Prometheus on the Controller Node. + # # Note: Uncomment the below lines to install Slurm Exporter and Prometheus on the Controller Node. 
# if node_type == SlurmNodeType.HEAD_NODE: # ExecuteBashScript("./utils/install_slurm_exporter.sh").run() # ExecuteBashScript("./utils/install_prometheus.sh").run()
diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/setup_timesync.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/setup_timesync.sh
new file mode 100644
index 00000000..e87d3b14
--- /dev/null
+++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/setup_timesync.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+# Credits: Sean Smith, Ben Snyder, Shubham Arora
+#
+# Consistent time across the cluster is crucial for distributed workloads. For example, torchrun
+# fails fast when it detects a time difference of 5 seconds (or more) among workers.
+#
+# Check the time of all compute nodes as follows:
+#
+#     srun -N <NUM_NODES> bash -c 'echo "$(hostname): $(date)"' | sort -k2,3
+#
+# To avoid time drift, enable time synchronization
+# (ref: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/set-time.html).
+#
+# The script is idempotent: it adds two chrony time sources (the link-local server
+# 169.254.169.123 and the time.aws.com pool) only when they are missing, then enables and
+# restarts chrony so the configuration takes effect.
+
+set -exuo pipefail
+
+FILE=/etc/chrony/chrony.conf
+
+# Ensure LINE ($1) is present in $FILE, inserting it after every line that matches the sed
+# regex ANCHOR ($2). When ANCHOR matches nothing, LINE is appended to the end of the file
+# instead, because sed exits 0 without inserting anything in that case.
+ensure_line() {
+    local line="$1" anchor="$2"
+
+    # -x: whole-line match, -F: fixed string (the dots in the IP are not regex wildcards).
+    if grep -qxF -- "${line}" "${FILE}"; then
+        echo "Line \"${line}\" already exists in ${FILE}"
+        return 0
+    fi
+
+    sed -i "/${anchor}/a ${line}" "${FILE}"
+    grep -qxF -- "${line}" "${FILE}" || echo "${line}" >> "${FILE}"
+}
+
+ensure_line 'server 169.254.169.123 prefer iburst minpoll 4 maxpoll 4' \
+    '\# See http:\/\/www.pool.ntp.org\/join.html for more information.'
+
+ensure_line 'pool time.aws.com iburst' \
+    '^server 169.254.169.123 prefer iburst minpoll 4 maxpoll 4$'
+
+# "enable --now" only starts a stopped unit; restart so an already-running chrony
+# re-reads the modified configuration.
+systemctl enable chrony
+systemctl restart chrony