From a964e435532699908e5750abdb027ae583ff793d Mon Sep 17 00:00:00 2001
From: Aviv Keshet
Date: Tue, 17 Dec 2024 09:33:09 -0800
Subject: [PATCH 1/2] Fix --enable_each_rank_log when used with PDSH multi-node runner (#6863)

This PR fixes https://github.com/microsoft/DeepSpeed/issues/6859 by
threading this argument into the deepspeed launcher command built by
PDSHRunner.

---------

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
---
 deepspeed/launcher/multinode_runner.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/deepspeed/launcher/multinode_runner.py b/deepspeed/launcher/multinode_runner.py
index 74d20a6d53e5..fe2fa1b476be 100644
--- a/deepspeed/launcher/multinode_runner.py
+++ b/deepspeed/launcher/multinode_runner.py
@@ -104,6 +104,8 @@ def get_cmd(self, environment, active_resources):
             deepspeed_launch.append("--no_local_rank")
         if self.args.save_pid:
             deepspeed_launch += ["--save_pid", f"{os.getpid()}"]
+        if self.args.enable_each_rank_log:
+            deepspeed_launch.append(f"--enable_each_rank_log={self.args.enable_each_rank_log}")
         if self.args.elastic_training:
             deepspeed_launch.append("--enable_elastic_training")
             deepspeed_launch.append(f"--max_elastic_nodes={self.args.max_elastic_nodes}")

From 2f32966b1cd874aa4373177c8f8c4214ad57d020 Mon Sep 17 00:00:00 2001
From: Logan Adams <114770087+loadams@users.noreply.github.com>
Date: Tue, 17 Dec 2024 11:53:47 -0800
Subject: [PATCH 2/2] Update transformers ops unit tests to use `required_torch_version` (#6884)

---
 .../ops/transformer/inference/test_bias_geglu.py   |  2 --
 .../ops/transformer/inference/test_bias_gelu.py    |  2 --
 .../ops/transformer/inference/test_bias_relu.py    |  2 --
 tests/unit/ops/transformer/inference/test_gelu.py  | 14 +++++---------
 .../unit/ops/transformer/inference/test_matmul.py  |  1 -
 .../unit/ops/transformer/inference/test_softmax.py |  2 --
 6 files changed, 5 insertions(+), 18 deletions(-)

diff --git a/tests/unit/ops/transformer/inference/test_bias_geglu.py b/tests/unit/ops/transformer/inference/test_bias_geglu.py
index 05de4fbb4cf8..c995d2a8c46d 100644
--- a/tests/unit/ops/transformer/inference/test_bias_geglu.py
+++ b/tests/unit/ops/transformer/inference/test_bias_geglu.py
@@ -15,8 +15,6 @@
 if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
     pytest.skip("Inference ops are not available on this system", allow_module_level=True)
 
-torch_minor_version = None
-
 
 def run_bias_geglu_reference(activations, bias):
     # Expected behavior is that of casting to float32 internally

diff --git a/tests/unit/ops/transformer/inference/test_bias_gelu.py b/tests/unit/ops/transformer/inference/test_bias_gelu.py
index b69030e87ace..e3a3bad63961 100644
--- a/tests/unit/ops/transformer/inference/test_bias_gelu.py
+++ b/tests/unit/ops/transformer/inference/test_bias_gelu.py
@@ -16,8 +16,6 @@
 if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
     pytest.skip("Inference ops are not available on this system", allow_module_level=True)
 
-torch_minor_version = None
-
 
 def run_bias_gelu_reference(activations, bias):
     # Expected behavior is that of casting to float32 internally and using the tanh approximation

diff --git a/tests/unit/ops/transformer/inference/test_bias_relu.py b/tests/unit/ops/transformer/inference/test_bias_relu.py
index 57134665b241..69078f9f7646 100644
--- a/tests/unit/ops/transformer/inference/test_bias_relu.py
+++ b/tests/unit/ops/transformer/inference/test_bias_relu.py
@@ -15,8 +15,6 @@
 if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
     pytest.skip("Inference ops are not available on this system", allow_module_level=True)
 
-torch_minor_version = None
-
 
 def run_bias_relu_reference(activations, bias):
     # Expected behavior is that of casting to float32 internally

diff --git a/tests/unit/ops/transformer/inference/test_gelu.py b/tests/unit/ops/transformer/inference/test_gelu.py
index 5f820ef3b579..a58abfdb100c 100644
--- a/tests/unit/ops/transformer/inference/test_gelu.py
+++ b/tests/unit/ops/transformer/inference/test_gelu.py
@@ -9,12 +9,11 @@
 from deepspeed.ops.op_builder import InferenceBuilder
 from deepspeed.ops.transformer import DeepSpeedInferenceConfig
 from deepspeed.ops.transformer.inference.op_binding.bias_gelu import BiasGeluOp
+from deepspeed.utils.torch import required_torch_version
 
 if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
     pytest.skip("Inference ops are not available on this system", allow_module_level=True)
 
-torch_minor_version = None
-
 
 def allclose(x, y):
     assert x.dtype == y.dtype
@@ -23,14 +22,11 @@ def allclose(x, y):
 
 
 def version_appropriate_gelu(activations):
-    global torch_minor_version
-    if torch_minor_version is None:
-        torch_minor_version = int(torch.__version__.split('.')[1])
-    # If torch version = 1.12
-    if torch_minor_version < 12:
-        return torch.nn.functional.gelu(activations)
-    else:
+    # gelu behavior changes (correctly) in torch 1.12
+    if required_torch_version(min_version=1.12):
         return torch.nn.functional.gelu(activations, approximate='tanh')
+    else:
+        return torch.nn.functional.gelu(activations)
 
 
 def run_gelu_reference(activations):

diff --git a/tests/unit/ops/transformer/inference/test_matmul.py b/tests/unit/ops/transformer/inference/test_matmul.py
index 559aa2c60afe..2ab195ee0115 100644
--- a/tests/unit/ops/transformer/inference/test_matmul.py
+++ b/tests/unit/ops/transformer/inference/test_matmul.py
@@ -12,7 +12,6 @@
     pytest.skip("Inference ops are not available on this system", allow_module_level=True)
 
 inference_module = None
-torch_minor_version = None
 
 
 def allclose(x, y):

diff --git a/tests/unit/ops/transformer/inference/test_softmax.py b/tests/unit/ops/transformer/inference/test_softmax.py
index e582be1b926a..83785ac38ebb 100644
--- a/tests/unit/ops/transformer/inference/test_softmax.py
+++ b/tests/unit/ops/transformer/inference/test_softmax.py
@@ -11,8 +11,6 @@
 if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
     pytest.skip("Inference ops are not available on this system", allow_module_level=True)
 
-torch_minor_version = None
-
 
 def allclose(x, y):
     assert x.dtype == y.dtype
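
For reference, a hypothetical multi-node launch that would exercise the code path fixed by the first patch. The hostfile path, log directory, and training script below are placeholders; --launcher, --hostfile, and --enable_each_rank_log are existing flags of the deepspeed runner:

    # With the PDSH launcher (the default multi-node runner), --enable_each_rank_log
    # is now forwarded to the per-node deepspeed launcher, so each rank's
    # stdout/stderr is redirected to its own log file under the given directory.
    deepspeed --launcher=pdsh --hostfile=/job/hostfile \
        --enable_each_rank_log=/tmp/rank_logs \
        train.py --deepspeed --deepspeed_config ds_config.json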