
Commit

Merge branch 'main' into 117_hccl
jerome-habana authored Oct 17, 2024
2 parents a082c9d + 1c4e7e3 commit 1c4f577
Showing 11 changed files with 48 additions and 38 deletions.
20 changes: 10 additions & 10 deletions .azure/hpu-tests.yml
@@ -30,19 +30,19 @@ jobs:
   strategy:
     matrix:
       'w. pytorch-lightning | pypi':
-        image: "1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest"
+        image: "1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest"
         dependency: "pytorch-lightning"
         pkg_source: "pypi"
       'w. pytorch-lightning | source':
-        image: "1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest"
+        image: "1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest"
         dependency: "pytorch-lightning"
         pkg_source: "source"
       'w. lightning | pypi':
-        image: "1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest"
+        image: "1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest"
         dependency: "lightning"
         pkg_source: "pypi"
       'w. lightning | source':
-        image: "1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest"
+        image: "1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest"
         dependency: "lightning"
         pkg_source: "source"
   pool: "intel-hpus"
@@ -58,7 +58,7 @@ jobs:
   variables:
     DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
     MODULE_ID: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
-    DEEPSPEED_VERSION: "1.17.0"
+    DEEPSPEED_VERSION: "1.18.0"

   workspace:
     clean: all
@@ -107,7 +107,7 @@ jobs:
         tests/test_fabric/test_accelerator.py \
         tests/test_fabric/test_strategy.py \
         tests/test_fabric/test_precision.py \
-        --hpus 1 --junitxml=hpu_test-fabric-results.xml
+        --hpus 1 -W ignore::FutureWarning --junitxml=hpu_test-fabric-results.xml
       python -m pytest -sv \
         tests/test_pytorch/test_accelerator.py \
         tests/test_pytorch/test_hpu_graphs.py \
@@ -117,13 +117,13 @@
         tests/test_pytorch/test_precision.py \
         tests/test_pytorch/strategies/test_hpu_parallel.py \
         tests/test_pytorch/strategies/test_hpu_ddp.py \
-        --hpus 1 -m "not standalone_only" \
+        --hpus 1 -W ignore::FutureWarning -m "not standalone_only" \
         --junitxml=hpu_test-torch-results.xml
     displayName: 'HPU General tests'
   - bash: |
       python -m pytest -sv tests/test_pytorch/test_compile.py \
-        --hpus 1 --junitxml=hpu_compile_test-results.xml
+        --hpus 1 -W ignore::FutureWarning --junitxml=hpu_compile_test-results.xml
     env:
       PT_HPU_LAZY_MODE: 0
     displayName: 'HPU torch compile tests'
@@ -139,13 +139,13 @@

   - bash: |
       python -m pytest -sv tests/test_pytorch/strategies/test_deepspeed.py \
-        -m "not standalone_only" --junitxml=hpu_deepspeed_test-results.xml
+        -m "not standalone_only" -W ignore::FutureWarning --junitxml=hpu_deepspeed_test-results.xml
     displayName: 'HPU Deepspeed tests'
   - bash: |
       python -m pytest -sv tests/test_pytorch/test_precision.py \
         -k test_autocast_operators_override --runxfail \
-        --junitxml=hpu_precision_test_override-results.xml
+        -W ignore::FutureWarning --junitxml=hpu_precision_test_override-results.xml
     env:
       LOWER_LIST: tests/test_pytorch/ops_fp32.txt
       FP32_LIST: tests/test_pytorch/ops_bf16.txt
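Note: the `-W ignore::FutureWarning` option added to these pytest invocations uses the same `action::category` filter syntax as Python's own `-W` flag. A minimal standard-library sketch of the equivalent filter (illustrative, not part of this diff):

    # Equivalent warning filter via the standard-library warnings API.
    import warnings

    warnings.filterwarnings("ignore", category=FutureWarning)
    warnings.warn("deprecated path", FutureWarning)  # now suppressed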
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -29,7 +29,7 @@ repos:
       #args: ["--write-changes"]  # uncomment if you want to get automatic fixing

   - repo: https://github.com/PyCQA/docformatter
-    rev: v1.7.5
+    rev: 06907d0267368b49b9180eed423fae5697c1e909  # todo: fix for docformatter after last 1.7.5
     hooks:
       - id: docformatter
         additional_dependencies: [tomli]
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

-## [1.7.0] - 2024-09-DD
+## [1.7.0] - 2024-10-DD

 ### Added

@@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Modified torch device specification for FSDP on HPU ([#222](https://github.com/Lightning-AI/lightning-Habana/pull/222))
 - Updated strategy to use default fork ([#234](https://github.com/Lightning-AI/lightning-Habana/pull/234))
 - Updated hpu parallel strategy as base class ([#237](https://github.com/Lightning-AI/lightning-Habana/pull/237))
+- Updated to Intel Gaudi software Release 1.18.0 ([#245](https://github.com/Lightning-AI/lightning-Habana/pull/245))

 ### Fixed
6 changes: 3 additions & 3 deletions README.md
@@ -63,12 +63,12 @@ The `devices>1` parameter with HPUs enables the Habana accelerator for distribut

 # Support Matrix

-| **SynapseAI**         | **1.17.0**                                          |
+| **SynapseAI**         | **1.18.0**                                          |
 | --------------------- | --------------------------------------------------- |
-| PyTorch               | 2.3.1                                               |
+| PyTorch               | 2.4.0                                               |
 | (PyTorch) Lightning\* | 2.4.x                                               |
 | **Lightning Habana**  | **1.7.0**                                           |
-| DeepSpeed\*\*         | Forked from v0.14.0 of the official DeepSpeed repo. |
+| DeepSpeed\*\*         | Forked from v0.14.4 of the official DeepSpeed repo. |

 \* covers both packages [`lightning`](https://pypi.org/project/lightning/) and [`pytorch-lightning`](https://pypi.org/project/pytorch-lightning/)
6 changes: 3 additions & 3 deletions src/lightning_habana/pytorch/plugins/deepspeed_precision.py
@@ -45,8 +45,8 @@
     from habana_frameworks.torch.hpex.experimental.transformer_engine.recipe import DelayedScaling

 _HPU_DEEPSPEED_AVAILABLE = (
-    # HPU deep speed is supported only through this pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.17.0
-    RequirementCache("deepspeed==0.14.0+hpu.synapse.v1.17.0")
+    # HPU deep speed is supported only through this pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0
+    RequirementCache("deepspeed==0.14.4+hpu.synapse.v1.18.0")
 )
 if _HPU_DEEPSPEED_AVAILABLE:
     import deepspeed
@@ -76,7 +76,7 @@ def __init__(
         if not _HPU_DEEPSPEED_AVAILABLE:
             raise MisconfigurationException(
                 "To use the `HPUDeepSpeedPrecisionPlugin`, you must have hpu DeepSpeed installed."
-                " Install it by running `pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.17.0`."
+                " Install it by running `pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0`."
             )
         super().__init__(device=device, precision=precision)
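Note: `_HPU_DEEPSPEED_AVAILABLE` is gated by `RequirementCache` from the `lightning-utilities` package, which checks a pip requirement string against the installed distribution. A hedged usage sketch (illustrative, not code from this diff):

    # RequirementCache evaluates the pinned requirement once; bool() runs the
    # check, and str() of a failed check explains what is missing.
    from lightning_utilities.core.imports import RequirementCache

    _HPU_DEEPSPEED_AVAILABLE = RequirementCache("deepspeed==0.14.4+hpu.synapse.v1.18.0")
    if not _HPU_DEEPSPEED_AVAILABLE:
        print(str(_HPU_DEEPSPEED_AVAILABLE))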
6 changes: 3 additions & 3 deletions src/lightning_habana/pytorch/strategies/deepspeed.py
@@ -86,8 +86,8 @@
 warning_cache = WarningCache()

 _HPU_DEEPSPEED_AVAILABLE = (
-    # HPU deep speed is supported only through this pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.17.0
-    RequirementCache("deepspeed==0.14.0+hpu.synapse.v1.17.0")
+    # HPU deep speed is supported only through this pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0
+    RequirementCache("deepspeed==0.14.4+hpu.synapse.v1.18.0")
 )
 if TYPE_CHECKING and _HPU_DEEPSPEED_AVAILABLE:
     import deepspeed
@@ -302,7 +302,7 @@ def __init__(
         if not _HPU_DEEPSPEED_AVAILABLE:
             raise MisconfigurationException(
                 "To use the `HPUDeepSpeedStrategy`, you must have hpu DeepSpeed installed."
-                " Install it by running `pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.17.0`."
+                " Install it by running `pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0`."
             )

         super().__init__(
8 changes: 5 additions & 3 deletions src/lightning_habana/utils/resources.py
@@ -169,17 +169,19 @@ def is_fp8_available() -> Tuple[bool, str]:
     """Returns a bool indicating if fp8 is available."""
     if not _HABANA_FRAMEWORK_AVAILABLE:
         raise OSError("Habana Frameworks required for training on Habana devices.")
-    import habana_frameworks.torch.hpex.experimental.transformer_engine as tengine

-    return tengine.fp8.is_fp8_available()
+    if get_device_name_from_hlsmi() == "GAUDI":
+        return False, "FP8 not supported on Gaudi, Gaudi2 or higher required"
+    return True, ""


 @lru_cache
 def is_fp16_available() -> Tuple[bool, str]:
     """Returns a bool indicating if fp16 is available."""
     if not _HABANA_FRAMEWORK_AVAILABLE:
         raise OSError("Habana Frameworks required for training on Habana devices.")
-    if torch_hpu.get_device_name() == "GAUDI":
+
+    if get_device_name_from_hlsmi() == "GAUDI":
         return False, "FP16 not supported on Gaudi, Gaudi2 or higher required."
     return True, ""
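Note: both helpers keep the `(bool, reason)` return contract while switching the device probe to `get_device_name_from_hlsmi()`, so fp8 no longer requires a transformer_engine import at call time. A hedged usage sketch (assumes an HPU host with the package installed):

    # Both checks return (available, reason); the reason doubles as a skip message.
    from lightning_habana.utils.resources import is_fp8_available, is_fp16_available

    for name, check in (("fp8", is_fp8_available), ("fp16", is_fp16_available)):
        available, reason = check()
        if not available:
            print(f"{name} unavailable: {reason}")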
2 changes: 1 addition & 1 deletion tests/test_pytorch/strategies/test_deepspeed.py
@@ -905,7 +905,7 @@ def test_step(self, batch, batch_idx):
     bf16_loss = torch.tensor(0.6641)
     if device_count == 2:
         bf16_loss = torch.tensor(1.2734)
-    assert torch.allclose(bf16_test_loss, bf16_loss, rtol=1e-5, atol=1e-5)
+    assert torch.allclose(bf16_test_loss, bf16_loss, rtol=1e-4, atol=1e-4)


 @pytest.mark.standalone_only()  # HQT cannot be reconfigured in same process
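Note: `torch.allclose(a, b, rtol, atol)` passes when `|a - b| <= atol + rtol * |b|`, so relaxing both tolerances from 1e-5 to 1e-4 absorbs small loss drift across software releases. A worked illustration with hypothetical values (not taken from the test):

    import torch

    a = torch.tensor(0.66406)  # hypothetical observed bf16 loss
    b = torch.tensor(0.6641)   # reference loss; |a - b| is about 4e-5

    assert not torch.allclose(a, b, rtol=1e-5, atol=1e-5)  # old tolerance fails
    assert torch.allclose(a, b, rtol=1e-4, atol=1e-4)      # new tolerance passes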
3 changes: 2 additions & 1 deletion tests/test_pytorch/test_compile.py
@@ -226,9 +226,10 @@ def test_ddp_strategy_with_compile(tmp_path, arg_hpus):
     ("record_module_names", "expectation"),
     [
         (False, nullcontext()),
-        (
+        pytest.param(
             True,
             pytest.raises(TypeError, match=r"nullcontext.__enter__\(\) missing 1 required positional argument: 'self'"),
+            marks=pytest.mark.xfail(reason="Failure with 1.18"),
         ),
     ],
 )
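Note: switching the case from a plain tuple to `pytest.param` is what allows attaching a mark (here `xfail`) to that single parametrized entry. A minimal self-contained sketch of the pattern (names are illustrative):

    import pytest

    @pytest.mark.parametrize(
        "value",
        [
            False,
            pytest.param(True, marks=pytest.mark.xfail(reason="known failure")),
        ],
    )
    def test_value(value):
        assert value is False  # the True case fails and is reported as xfail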
6 changes: 3 additions & 3 deletions tests/test_pytorch/test_dynamic_shapes.py
@@ -61,11 +61,11 @@ def run_training(tmpdir, hpus, model, data_module):

 def test_dynamic_shapes_recompilations_recipe_caching(tmpdir, arg_hpus, monkeypatch):
     """Tests number of recompilations between cached and non-cached runs."""
-    with monkeypatch.context() as m:
-        m.setenv("PT_HPU_RECIPE_CACHE_CONFIG", f"{tmpdir}/recipes/,true,1024")
-        cached_compiles = run_training(tmpdir, hpus=arg_hpus, model=DynamicOpsBoringModel, data_module=BoringDataModule)
     default_compiles = run_training(tmpdir, hpus=arg_hpus, model=DynamicOpsBoringModel, data_module=BoringDataModule)

+    monkeypatch.setenv("PT_HPU_RECIPE_CACHE_CONFIG", f"{tmpdir}/recipes,True,1024")
+    cached_compiles = run_training(tmpdir, hpus=arg_hpus, model=DynamicOpsBoringModel, data_module=BoringDataModule)

     assert cached_compiles[0] <= default_compiles[0]
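Note: a function-scoped `monkeypatch.setenv` is undone automatically at test teardown, so the explicit `monkeypatch.context()` block is unnecessary; reordering also keeps the default run free of the recipe-cache variable. A minimal sketch of the scoping behavior (test name is illustrative):

    import os

    def test_setenv_is_scoped(monkeypatch, tmp_path):
        # Set for this test only; pytest restores the environment afterwards.
        monkeypatch.setenv("PT_HPU_RECIPE_CACHE_CONFIG", f"{tmp_path}/recipes,True,1024")
        assert os.environ["PT_HPU_RECIPE_CACHE_CONFIG"].endswith(",True,1024")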
24 changes: 15 additions & 9 deletions tests/test_pytorch/test_precision.py
@@ -118,10 +118,10 @@ class BMPluginActive(BaseBM):
     def forward(self, x):
         """Forward."""
         if self.trainer.precision == "fp8":
-            assert tengine.fp8.is_fp8_enabled()
+            assert tengine.fp8.FP8GlobalStateManager.is_fp8_enabled()
             assert not torch.hpu.is_autocast_hpu_enabled()
         else:
-            assert not tengine.fp8.is_fp8_enabled()
+            assert not tengine.fp8.FP8GlobalStateManager.is_fp8_enabled()
             assert torch.hpu.is_autocast_hpu_enabled()
         return super().forward(x)
@@ -307,7 +307,13 @@ def test_hpu_precision_fp8_patch(patch_path, tmpdir, fp8_config):
             pytest.raises(FileNotFoundError),
         ),
         (
-            {"inference": True, "quant": {"mode": "MEASURE"}},
+            {
+                "inference": True,
+                "quant": {
+                    "mode": "MEASURE",
+                    "allowlist": {"types": [], "names": []},
+                },
+            },
             nullcontext(),
         ),
         ({"inference": False}, nullcontext()),
@@ -365,17 +371,17 @@ def test_hpu_precision_fp8_inference_with_quant_dict(tmpdir):
 @pytest.mark.standalone_only()
 @pytest.mark.skipif(get_device_name_from_hlsmi() == "GAUDI", reason="fp8 supported on Gaudi2 and above.")
 def test_hpu_precision_fp8_inference_log_files(tmpdir):
-    log_file = os.path.join(os.environ["HABANA_LOGS"], "hqt_log.txt")
-    # remove log file if it exists
+    log_file = os.path.join(os.environ["HABANA_LOGS"], "inc_log.txt")
+    file_size = 0  # if file does not exist
     if os.path.isfile(log_file):
-        os.remove(log_file)
+        file_size = os.path.getsize(log_file)  # file exists. log will be appended.

     precision_plugin = HPUPrecisionPlugin(precision="fp8")
     precision_plugin.convert_modules(module=BoringModel(), inference=True, quant=False, fp8_data_path=tmpdir)

-    # check log file is created with size > 0
     assert os.path.isfile(log_file)
-    assert os.path.getsize(log_file) > 0
+    assert os.path.getsize(log_file) > file_size


 @pytest.mark.standalone_only()
@@ -781,9 +787,9 @@ def training_step(self, batch, batch_idx):
         # fp8 training is only enabled when precision is fp8,
         # even if module used is from transformer engine.
         if precision == "fp8":
-            assert tengine.fp8.is_fp8_enabled()
+            assert tengine.fp8.FP8GlobalStateManager.is_fp8_enabled()
         else:
-            assert not tengine.fp8.is_fp8_enabled()
+            assert not tengine.fp8.FP8GlobalStateManager.is_fp8_enabled()
         return super().training_step(batch, batch_idx)

     def configure_optimizers(self):
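Note: the repeated rename reflects a newer transformer-engine layout in which the fp8 state query moved from the module level to `FP8GlobalStateManager`. One way to absorb such moves is a small compatibility shim (a hedged sketch, not code from this repo):

    import habana_frameworks.torch.hpex.experimental.transformer_engine as tengine

    def is_fp8_enabled() -> bool:
        # Prefer the newer class-level API; fall back to the older module-level one.
        manager = getattr(tengine.fp8, "FP8GlobalStateManager", None)
        if manager is not None:
            return manager.is_fp8_enabled()
        return tengine.fp8.is_fp8_enabled()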
