From 896e9a591125d144072c5d7ed929e4436c361e02 Mon Sep 17 00:00:00 2001
From: Simon Adamov <simon.adamov@mailbox.org>
Date: Sat, 4 May 2024 15:40:48 +0200
Subject: [PATCH 1/4] Introduces multi-node training setup

---
 train_model.py | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/train_model.py b/train_model.py
index 96d21a3f..8b7371a4 100644
--- a/train_model.py
+++ b/train_model.py
@@ -1,4 +1,5 @@
 # Standard library
+import os
 import random
 import time
 from argparse import ArgumentParser
@@ -230,13 +231,25 @@ def main():
     )
 
     # Instantiate model + trainer
+    if args.eval:
+        use_distributed_sampler = False
+    else:
+        use_distributed_sampler = True
+
+    devices = 1
+    num_nodes = 1
     if torch.cuda.is_available():
-        device_name = "cuda"
-        torch.set_float32_matmul_precision(
-            "high"
-        )  # Allows using Tensor Cores on A100s
+        accelerator = "cuda"
+        if "SLURM_JOB_ID" in os.environ:
+            devices = int(
+                os.environ.get(
+                    "SLURM_GPUS_PER_NODE",
+                    torch.cuda.device_count()))
+            num_nodes = int(os.environ.get("SLURM_JOB_NUM_NODES", 1))
+            # Allows using Tensor Cores on A100s
+            torch.set_float32_matmul_precision("high")
     else:
-        device_name = "cpu"
+        accelerator = "cpu"
 
     # Load model parameters Use new args for model
     model_class = MODELS[args.model]
@@ -269,8 +282,10 @@ def main():
     trainer = pl.Trainer(
         max_epochs=args.epochs,
         deterministic=True,
-        strategy="ddp",
-        accelerator=device_name,
+        accelerator=accelerator,
+        devices=devices,
+        num_nodes=num_nodes,
+        use_distributed_sampler=use_distributed_sampler,
         logger=logger,
         log_every_n_steps=1,
         callbacks=[checkpoint_callback],

From f28f7989f6c602dd236367d67a94a0233784ec82 Mon Sep 17 00:00:00 2001
From: Simon Adamov <simon.adamov@mailbox.org>
Date: Sat, 4 May 2024 21:20:41 +0200
Subject: [PATCH 2/4] eval is recommended by torch to run on one device

---
 train_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train_model.py b/train_model.py
index 8b7371a4..36b9a67b 100644
--- a/train_model.py
+++ b/train_model.py
@@ -240,7 +240,7 @@ def main():
     num_nodes = 1
     if torch.cuda.is_available():
         accelerator = "cuda"
-        if "SLURM_JOB_ID" in os.environ:
+        if "SLURM_JOB_ID" in os.environ and not args.eval:
             devices = int(
                 os.environ.get(
                     "SLURM_GPUS_PER_NODE",

From f708da9680df9d52846245418805be1f33c608d3 Mon Sep 17 00:00:00 2001
From: Simon Adamov <simon.adamov@meteoswiss.ch>
Date: Mon, 13 May 2024 17:27:51 +0200
Subject: [PATCH 3/4] linter

---
 train_model.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/train_model.py b/train_model.py
index 36b9a67b..5a3bfb5b 100644
--- a/train_model.py
+++ b/train_model.py
@@ -23,7 +23,7 @@
 }
 
 
-def main():
+def main():  # pylint: disable=too-many-branches
     """
     Main function for training and evaluating models
     """
@@ -242,9 +242,8 @@ def main():
         accelerator = "cuda"
         if "SLURM_JOB_ID" in os.environ and not args.eval:
             devices = int(
-                os.environ.get(
-                    "SLURM_GPUS_PER_NODE",
-                    torch.cuda.device_count()))
+                os.environ.get("SLURM_GPUS_PER_NODE", torch.cuda.device_count())
+            )
             num_nodes = int(os.environ.get("SLURM_JOB_NUM_NODES", 1))
             # Allows using Tensor Cores on A100s
             torch.set_float32_matmul_precision("high")

From 07903d5abce939f3b2e87168c80e7848031d1ea5 Mon Sep 17 00:00:00 2001
From: Simon Adamov <simon.adamov@meteoswiss.ch>
Date: Fri, 7 Jun 2024 13:08:51 +0200
Subject: [PATCH 4/4] Added documentation and an example slurm file

---
 .gitignore                        |  1 -
 CHANGELOG.md                      |  5 +++++
 README.md                         |  9 +++++++++
 docs/examples/submit_slurm_job.sh | 17 +++++++++++++++++
 4 files changed, 31 insertions(+), 1 deletion(-)
 create mode 100644 docs/examples/submit_slurm_job.sh

diff --git a/.gitignore b/.gitignore
index 65e9f6f8..af8bed85 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,7 +9,6 @@ graphs
 sweeps
 test_*.sh
 .vscode
-*slurm*
 
 ### Python ###
 # Byte-compiled / optimized / DLL files
diff --git a/CHANGELOG.md b/CHANGELOG.md
index f4680c37..25eb344c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [unreleased](https://github.com/joeloskarsson/neural-lam/compare/v0.1.0...HEAD)
 
 ### Added
+
+- Support for distributed training using DDP (DistributedDataParallel) and SLURM on multi-node multi-GPU setups
+  [/#26](https://github.com/mllam/neural-lam/pull/26)
+  @sadamov
+
 - Added tests for loading dataset, creating graph, and training model based on reduced MEPS dataset stored on AWS S3, along with automatic running of tests on push/PR to GitHub. Added caching of test data tp speed up running tests.
   [/#38](https://github.com/mllam/neural-lam/pull/38)
   @SimonKamuk
diff --git a/README.md b/README.md
index 1bdc6602..1e703560 100644
--- a/README.md
+++ b/README.md
@@ -136,6 +136,7 @@ A few of the key ones are outlined below:
 * `--ar_steps`: Number of time steps to unroll for when making predictions and computing the loss
 
 Checkpoints of trained models are stored in the `saved_models` directory.
+
 The implemented models are:
 
 ### Graph-LAM
@@ -172,6 +173,14 @@ python train_model.py --model hi_lam_parallel --graph hierarchical ...
 
 Checkpoint files for our models trained on the MEPS data are available upon request.
 
+### High Performance Computing
+
+The training script can be run on a cluster with multiple GPU-nodes. Neural LAM is set up to use PyTorch Lightning's `DDP` backend for distributed training.
+Currently, only the SLURM (Simple Linux Utility for Resource Management) scheduler is supported.
+To run on a cluster, consider the following example script: `docs/examples/submit_slurm_job.sh`.
+This script must first be adapted to the specific requirements of the cluster and then submitted with `sbatch`.
+If SLURM is not available in the current environment, the script is run locally.
+
 ## Evaluate Models
 Evaluation is also done using `train_model.py`, but using the `--eval` option.
 Use `--eval val` to evaluate the model on the validation set and `--eval test` to evaluate on test data.
diff --git a/docs/examples/submit_slurm_job.sh b/docs/examples/submit_slurm_job.sh
new file mode 100644
index 00000000..941ebbc0
--- /dev/null
+++ b/docs/examples/submit_slurm_job.sh
@@ -0,0 +1,17 @@
+#!/bin/bash -l
+#SBATCH --job-name=Neural-LAM
+#SBATCH --time=24:00:00
+#SBATCH --nodes=2
+#SBATCH --ntasks-per-node=4
+#SBATCH --gres:gpu=4
+#SBATCH --partition=normal
+#SBATCH --mem=444G
+#SBATCH --no-requeue
+#SBATCH --exclusive
+#SBATCH --output=lightning_logs/neurallam_out_%j.log
+#SBATCH --error=lightning_logs/neurallam_err_%j.log
+
+# Load necessary modules or activate environment, for example:
+conda activate neural-lam
+
+srun -ul python train_model.py --val_interval 5 --epochs 20 --n_workers 8 --batch_size 12 --model hi_lam --graph hierarchical