From 78ef00ce8008b42b41afa652ded949e182499a06 Mon Sep 17 00:00:00 2001 From: Verdi March Date: Fri, 1 Mar 2024 14:30:37 +0800 Subject: [PATCH 1/2] SMHP time sync Upstream portion of aws-samples/playground-persistent-cluster#c050891 Co-authored-by: Sean Smith (@sean-smith) Co-authored-by: Ben Snyder (@johnbensnyder) Co-authored-by: Shubham Arora --- .../base-config/lifecycle_script.py | 6 ++-- .../base-config/utils/setup_timesync.sh | 33 +++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) create mode 100644 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/setup_timesync.sh diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/lifecycle_script.py b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/lifecycle_script.py index 8087af83..624ffd0f 100644 --- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/lifecycle_script.py +++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/lifecycle_script.py @@ -150,7 +150,7 @@ def main(args): ExecuteBashScript("./setup_mariadb_accounting.sh").run() ExecuteBashScript("./utils/motd.sh").run(node_type) - + ExecuteBashScript("./utils/setup_timesync.sh").run() ExecuteBashScript("./utils/fsx_ubuntu.sh").run() ExecuteBashScript("./start_slurm.sh").run(node_type, ",".join(controllers)) @@ -158,13 +158,13 @@ def main(args): ## Note: Uncomment the below lines to install docker and enroot. # ExecuteBashScript("./utils/install_docker.sh").run() # ExecuteBashScript("./utils/install_enroot_pyxis.sh").run(node_type) - + # # Note: Uncomment the below lines to install DCGM Exporter and EFA Node Exporter and Cluster Nodes. (Docker must also be installed above) # if node_type == SlurmNodeType.COMPUTE_NODE: # ExecuteBashScript("./utils/install_dcgm_exporter.sh").run() # ExecuteBashScript("./utils/install_efa_node_exporter.sh").run() - # # Note: Uncomment the below lines to install Slurm Exporter and Prometheus on the Controller Node. 
+ # # Note: Uncomment the below lines to install Slurm Exporter and Prometheus on the Controller Node. # if node_type == SlurmNodeType.HEAD_NODE: # ExecuteBashScript("./utils/install_slurm_exporter.sh").run() # ExecuteBashScript("./utils/install_prometheus.sh").run() diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/setup_timesync.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/setup_timesync.sh new file mode 100644 index 00000000..8553e64a --- /dev/null +++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/setup_timesync.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# Credits: Sean Smith, Ben Snyder, Shubham Arora + +# Consistent times across cluster is crucial for distributed workload. For example, torchrun fails +# fast when it detects 5 seconds (or more) time differences among workers. +# +# Check the time of all compute nodes as follows: +# +# srun -N <NUM_NODES> bash -c 'echo "$(hostname): $(date)"' | sort -k2,3 +# +# +# To avoid time drifts, enable time synchronization (ref: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/set-time.html). 
+ +set -exuo pipefail + +FILE=/etc/chrony/chrony.conf + +line='server 169.254.169.123 prefer iburst minpoll 4 maxpoll 4' +grep "^${line}$" $FILE &> /dev/null \ + && echo Line \"${line}\" already exists in $FILE \ + || sed -i \ + "/\# See http:\/\/www.pool.ntp.org\/join.html for more information./a ${line}" \ + $FILE + +line='pool time.aws.com iburst' +grep "^${line}$" $FILE &> /dev/null \ + && echo Line \"${line}\" already exists in $FILE \ + || sed -i \ + "/^server 169.254.169.123 prefer iburst minpoll 4 maxpoll 4$/a ${line}" \ + $FILE + +systemctl enable --now chrony From fc60bacdd3670b1c2bb5474b1bae1c4c01237533 Mon Sep 17 00:00:00 2001 From: Verdi March Date: Mon, 4 Mar 2024 09:01:16 +0800 Subject: [PATCH 2/2] Add license header --- .../LifecycleScripts/base-config/utils/setup_timesync.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/setup_timesync.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/setup_timesync.sh index 8553e64a..e87d3b14 100644 --- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/setup_timesync.sh +++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/setup_timesync.sh @@ -1,7 +1,10 @@ #!/bin/bash -# Credits: Sean Smith, Ben Snyder, Shubham Arora +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +# Credits: Sean Smith, Ben Snyder, Shubham Arora +# # Consistent times across cluster is crucial for distributed workload. For example, torchrun fails # fast when it detects 5 seconds (or more) time differences among workers. #