Skip to content

Commit

Permalink
Merge pull request #172 from aws-samples/smhp-time-sync
Browse files Browse the repository at this point in the history
SMHP time sync
  • Loading branch information
verdimrc authored Mar 6, 2024
2 parents 8cdfe8a + fc60bac commit b428a4d
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -150,21 +150,21 @@ def main(args):
ExecuteBashScript("./setup_mariadb_accounting.sh").run()

ExecuteBashScript("./utils/motd.sh").run(node_type)

ExecuteBashScript("./utils/setup_timesync.sh").run()
ExecuteBashScript("./utils/fsx_ubuntu.sh").run()

ExecuteBashScript("./start_slurm.sh").run(node_type, ",".join(controllers))

## Note: Uncomment the below lines to install docker and enroot.
# ExecuteBashScript("./utils/install_docker.sh").run()
# ExecuteBashScript("./utils/install_enroot_pyxis.sh").run(node_type)

# # Note: Uncomment the below lines to install DCGM Exporter and EFA Node Exporter and Cluster Nodes. (Docker must also be installed above)
# if node_type == SlurmNodeType.COMPUTE_NODE:
# ExecuteBashScript("./utils/install_dcgm_exporter.sh").run()
# ExecuteBashScript("./utils/install_efa_node_exporter.sh").run()

# # Note: Uncomment the below lines to install Slurm Exporter and Prometheus on the Controller Node.
# # Note: Uncomment the below lines to install Slurm Exporter and Prometheus on the Controller Node.
# if node_type == SlurmNodeType.HEAD_NODE:
# ExecuteBashScript("./utils/install_slurm_exporter.sh").run()
# ExecuteBashScript("./utils/install_prometheus.sh").run()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/bin/bash

# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

# Credits: Sean Smith, Ben Snyder, Shubham Arora
#
# Consistent time across the cluster is crucial for distributed workloads. For example, torchrun
# fails fast when it detects a time difference of 5 seconds (or more) among workers.
#
# Check the time of all compute nodes as follows:
#
# srun -N <NUM_OF_NODES> bash -c 'echo "$(hostname): $(date)"' | sort -k2,3
#
#
# To avoid time drift, enable time synchronization (ref: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/set-time.html).

set -exuo pipefail

FILE=/etc/chrony/chrony.conf

# Fail early with a readable message instead of a raw grep/sed error.
if [[ ! -f "$FILE" ]]; then
  echo "ERROR: $FILE not found; is chrony installed?" >&2
  exit 1
fi

#######################################
# Make sure an exact line is present in the chrony configuration, preferably
# right after an anchor line so related directives stay grouped together.
# Globals:   FILE (read; the file is edited in place)
# Arguments: $1 - anchor as a sed BRE address body (any '/' must be escaped)
#            $2 - the configuration line that must exist
# Outputs:   a notice on stdout when the line is already present
#######################################
ensure_line() {
  local anchor=$1
  local line=$2

  # -x: whole-line match, -F: literal string (dots in IPs/hostnames are not wildcards).
  if grep -qxF -- "$line" "$FILE"; then
    echo "Line \"${line}\" already exists in $FILE"
    return 0
  fi

  sed -i "/${anchor}/a ${line}" "$FILE"

  # sed is a silent no-op when the anchor is absent (e.g. a customized
  # chrony.conf). Fall back to appending at the end of the file so the time
  # source is always configured.
  if ! grep -qxF -- "$line" "$FILE"; then
    # Guard against gluing the new directive onto an unterminated last line.
    [[ -z "$(tail -c 1 "$FILE")" ]] || echo >> "$FILE"
    printf '%s\n' "$line" >> "$FILE"
  fi
}

# Amazon Time Sync Service, link-local endpoint (preferred source).
ensure_line \
  '# See http:\/\/www\.pool\.ntp\.org\/join\.html for more information\.' \
  'server 169.254.169.123 prefer iburst minpoll 4 maxpoll 4'

# Public Amazon Time Sync pool, placed right after the link-local server entry.
ensure_line \
  '^server 169\.254\.169\.123 prefer iburst minpoll 4 maxpoll 4$' \
  'pool time.aws.com iburst'

# 'enable --now' only starts a stopped unit; an already-running chronyd would
# keep its old configuration. Enable for boot, then restart so the edited
# configuration is actually loaded.
systemctl enable chrony
systemctl restart chrony

0 comments on commit b428a4d

Please sign in to comment.