update scripts and readme
Sikan Li committed Jun 28, 2024
1 parent 66289a5 commit 9a8ea12
Showing 3 changed files with 46 additions and 19 deletions.
23 changes: 4 additions & 19 deletions README.md
@@ -262,20 +262,10 @@ ROLLOUT_PATH="${TMP_DIR}/${DATASET_NAME}/rollout/"
python -m gns.train --data_path=${DATA_PATH} --model_path=${MODEL_PATH} --ntraining_steps=10
```

### Building GNS environment on TACC (LS6 and Frontera)
### Build Docker Image

- to set up a virtualenv
A Dockerfile is supplied to build the image.
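As a rough sketch (the tag and paths below are illustrative, not taken from the repo docs):

```shell
# Build the image from the repository root; the tag is illustrative
docker build -t gns:latest .
# The TACC scripts below run the container through Apptainer; one way to obtain
# a .sif image from the local Docker daemon:
apptainer build gns.sif docker-daemon://gns:latest
```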

```shell
sh ./build_venv.sh
```

- check that the tests run successfully.
- start your environment

```shell
source start_venv.sh
```

### Building GNS on MacOS
```shell
@@ -296,17 +286,12 @@ GNS can be trained in parallel on multiple nodes with multiple GPUs.
> GNS scaling result on [TACC lonestar6 GPU nodes](https://docs.tacc.utexas.edu/hpc/lonestar6/#table2) with A100 GPUs.
### Usage
#### Single Node, Multi-GPU
```shell
python -m torch.distributed.launch --nnodes=1 --nproc_per_node=[GPU_PER_NODE] --node_rank=[LOCAL_RANK] --master_addr=[MAIN_RANK] gns/train_multinode.py [ARGS]
```
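
For illustration only, a filled-in single-node run with two GPUs, reusing the `DATA_PATH`/`MODEL_PATH` variables from the training example above (the GPU count and argument values are assumptions):

```shell
python -m torch.distributed.launch --nnodes=1 --nproc_per_node=2 --node_rank=0 \
    --master_addr=localhost gns/train_multinode.py \
    --data_path=${DATA_PATH} --model_path=${MODEL_PATH} --ntraining_steps=10
```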

#### Multi-node, Multi-GPU
On each node, run
```shell
python -m torch.distributed.launch --nnodes=[NNODES] --nproc_per_node=[GPU_PER_NODE] --node_rank=[LOCAL_RANK] --master_addr=[MAIN_RANK] gns/train_multinode.py [ARGS]
mpiexec.hydra -np $NNODES -ppn 1 ../slurm_scripts/launch_helper.sh $DOCKER_IMG_LOCATION
```
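
As a sketch of how `NNODES` and `DOCKER_IMG_LOCATION` might be set beforehand (the `.sif` path is a placeholder; the node-count logic mirrors `slurm_scripts/launch_train.sh` below):

```shell
# Count the nodes in the current Slurm allocation
NODEFILE=/tmp/hostfile
scontrol show hostnames > $NODEFILE
NNODES=$(< $NODEFILE wc -l)
# Placeholder path to the Apptainer/Singularity image
DOCKER_IMG_LOCATION=/path/to/gns.sif
```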


### Inspiration
The PyTorch versions of the Graph Network Simulator and the Mesh Graph Network Simulator are based on:
* [https://arxiv.org/abs/2002.09405](https://arxiv.org/abs/2002.09405) and [https://github.com/deepmind/deepmind-research/tree/master/learning_to_simulate](https://github.com/deepmind/deepmind-research/tree/master/learning_to_simulate)
23 changes: 23 additions & 0 deletions slurm_scripts/launch_helper.sh
@@ -0,0 +1,23 @@
#!/bin/bash
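# Per-node launch helper: starts torchrun for this node inside an Apptainer container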

# PMI_RANK is set by mpiexec.hydra; with one task per node it serves as this node's rank
LOCAL_RANK=$PMI_RANK

# Write the allocated hostnames to a node file and derive the node count
NODEFILE=/tmp/hostfile
scontrol show hostnames > $NODEFILE
if [[ ! -s "${NODEFILE}" ]]; then
    # Fallback: no hostnames reported, assume a single-node run on this host
    MAIN_RANK=$(hostname)
    NNODES=1
else
    MAIN_RANK=$(head -n 1 $NODEFILE)
    RANKS=$(tr '\n' ' ' < $NODEFILE)
    NNODES=$(< $NODEFILE wc -l)
fi


# Run the training command inside the Apptainer image passed as the first argument ($1)
PRELOAD="/opt/apps/tacc-apptainer/1.1.8/bin/apptainer exec --nv $1 "
# torchrun assumes four GPUs per node; NNODES, LOCAL_RANK and MAIN_RANK are set above
CMD="torchrun --nproc_per_node 4 --nnodes $NNODES --node_rank=$LOCAL_RANK --master_addr=$MAIN_RANK --master_port=1234 train.py"

FULL_CMD="$PRELOAD $CMD"
echo "Training command: $FULL_CMD"

eval $FULL_CMD
19 changes: 19 additions & 0 deletions slurm_scripts/launch_train.sh
@@ -0,0 +1,19 @@
#!/bin/bash

#SBATCH -J train # Job name
#SBATCH -o train.o%j # Name of stdout output file
#SBATCH -e train.e%j # Name of stderr error file
#SBATCH -p gpu-a100 # Queue (partition) name
#SBATCH -N 2 # Total number of nodes
#SBATCH -n 2 # Total number of MPI tasks (one per node)
#SBATCH -t 0:05:00 # Run time (hh:mm:ss)
#SBATCH --mail-type=all # Send email at begin and end of job


# Train for a few steps.
NODEFILE=/tmp/hostfile
scontrol show hostnames > $NODEFILE
NNODES=$(< $NODEFILE wc -l)

# First argument: path to the container image, passed on to launch_helper.sh
CONTAINER=$1
# Launch one helper process per node; each starts torchrun inside the container
mpiexec.hydra -np $NNODES -ppn 1 ../slurm_scripts/launch_helper.sh $CONTAINER
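
A hedged submission example: the image path is a placeholder, and the working directory is assumed to be the `gns/` source directory so that `train.py` and `../slurm_scripts/launch_helper.sh` both resolve.

```shell
# Submit the 2-node training job; arguments after the script are passed through as $1
sbatch ../slurm_scripts/launch_train.sh /path/to/gns.sif
```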
