
Commit

Merge branch 'main' into 117_hccl
jerome-habana authored Oct 17, 2024
2 parents a082c9d + 1c4e7e3 commit 1c4f577
Showing 11 changed files with 48 additions and 38 deletions.
20 changes: 10 additions & 10 deletions .azure/hpu-tests.yml
@@ -30,19 +30,19 @@ jobs:
   strategy:
     matrix:
       'w. pytorch-lightning | pypi':
-        image: "1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest"
+        image: "1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest"
         dependency: "pytorch-lightning"
         pkg_source: "pypi"
       'w. pytorch-lightning | source':
-        image: "1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest"
+        image: "1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest"
         dependency: "pytorch-lightning"
         pkg_source: "source"
       'w. lightning | pypi':
-        image: "1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest"
+        image: "1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest"
         dependency: "lightning"
         pkg_source: "pypi"
       'w. lightning | source':
-        image: "1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest"
+        image: "1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest"
         dependency: "lightning"
         pkg_source: "source"
   pool: "intel-hpus"
@@ -58,7 +58,7 @@ jobs:
   variables:
     DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
     MODULE_ID: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
-    DEEPSPEED_VERSION: "1.17.0"
+    DEEPSPEED_VERSION: "1.18.0"

   workspace:
     clean: all
@@ -107,7 +107,7 @@ jobs:
         tests/test_fabric/test_accelerator.py \
         tests/test_fabric/test_strategy.py \
         tests/test_fabric/test_precision.py \
-        --hpus 1 --junitxml=hpu_test-fabric-results.xml
+        --hpus 1 -W ignore::FutureWarning --junitxml=hpu_test-fabric-results.xml
       python -m pytest -sv \
         tests/test_pytorch/test_accelerator.py \
         tests/test_pytorch/test_hpu_graphs.py \
@@ -117,13 +117,13 @@
         tests/test_pytorch/test_precision.py \
         tests/test_pytorch/strategies/test_hpu_parallel.py \
         tests/test_pytorch/strategies/test_hpu_ddp.py \
-        --hpus 1 -m "not standalone_only" \
+        --hpus 1 -W ignore::FutureWarning -m "not standalone_only" \
         --junitxml=hpu_test-torch-results.xml
     displayName: 'HPU General tests'
   - bash: |
       python -m pytest -sv tests/test_pytorch/test_compile.py \
-        --hpus 1 --junitxml=hpu_compile_test-results.xml
+        --hpus 1 -W ignore::FutureWarning --junitxml=hpu_compile_test-results.xml
     env:
       PT_HPU_LAZY_MODE: 0
     displayName: 'HPU torch compile tests'
@@ -139,13 +139,13 @@

   - bash: |
       python -m pytest -sv tests/test_pytorch/strategies/test_deepspeed.py \
-        -m "not standalone_only" --junitxml=hpu_deepspeed_test-results.xml
+        -m "not standalone_only" -W ignore::FutureWarning --junitxml=hpu_deepspeed_test-results.xml
     displayName: 'HPU Deepspeed tests'
   - bash: |
       python -m pytest -sv tests/test_pytorch/test_precision.py \
         -k test_autocast_operators_override --runxfail \
-        --junitxml=hpu_precision_test_override-results.xml
+        -W ignore::FutureWarning --junitxml=hpu_precision_test_override-results.xml
     env:
       LOWER_LIST: tests/test_pytorch/ops_fp32.txt
       FP32_LIST: tests/test_pytorch/ops_bf16.txt
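Note: the `-W ignore::FutureWarning` option added to these pytest invocations uses the same `action::category` filter syntax as Python's own `-W` flag. A minimal standard-library sketch of the equivalent filter (illustrative, not part of this diff):

    # Equivalent warning filter via the standard-library warnings API.
    import warnings

    warnings.filterwarnings("ignore", category=FutureWarning)
    warnings.warn("deprecated path", FutureWarning)  # now suppressed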
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -29,7 +29,7 @@ repos:
       #args: ["--write-changes"]  # uncomment if you want to get automatic fixing

   - repo: https://github.com/PyCQA/docformatter
-    rev: v1.7.5
+    rev: 06907d0267368b49b9180eed423fae5697c1e909  # todo: fix for docformatter after last 1.7.5
     hooks:
       - id: docformatter
         additional_dependencies: [tomli]
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

-## [1.7.0] - 2024-09-DD
+## [1.7.0] - 2024-10-DD

 ### Added

@@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Modified torch device specification for FSDP on HPU ([#222](https://github.com/Lightning-AI/lightning-Habana/pull/222))
 - Updated strategy to use default fork ([#234](https://github.com/Lightning-AI/lightning-Habana/pull/234))
 - Updated hpu parallel strategy as base class ([#237](https://github.com/Lightning-AI/lightning-Habana/pull/237))
+- Updated to Intel Gaudi software Release 1.18.0 ([#245](https://github.com/Lightning-AI/lightning-Habana/pull/245))

 ### Fixed
6 changes: 3 additions & 3 deletions README.md
@@ -63,12 +63,12 @@ The `devices>1` parameter with HPUs enables the Habana accelerator for distribut

 # Support Matrix

-| **SynapseAI**         | **1.17.0**                                          |
+| **SynapseAI**         | **1.18.0**                                          |
 | --------------------- | --------------------------------------------------- |
-| PyTorch               | 2.3.1                                               |
+| PyTorch               | 2.4.0                                               |
 | (PyTorch) Lightning\* | 2.4.x                                               |
 | **Lightning Habana**  | **1.7.0**                                           |
-| DeepSpeed\*\*         | Forked from v0.14.0 of the official DeepSpeed repo. |
+| DeepSpeed\*\*         | Forked from v0.14.4 of the official DeepSpeed repo. |

 \* covers both packages [`lightning`](https://pypi.org/project/lightning/) and [`pytorch-lightning`](https://pypi.org/project/pytorch-lightning/)
6 changes: 3 additions & 3 deletions src/lightning_habana/pytorch/plugins/deepspeed_precision.py
@@ -45,8 +45,8 @@
     from habana_frameworks.torch.hpex.experimental.transformer_engine.recipe import DelayedScaling

 _HPU_DEEPSPEED_AVAILABLE = (
-    # HPU deep speed is supported only through this pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.17.0
-    RequirementCache("deepspeed==0.14.0+hpu.synapse.v1.17.0")
+    # HPU deep speed is supported only through this pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0
+    RequirementCache("deepspeed==0.14.4+hpu.synapse.v1.18.0")
 )
 if _HPU_DEEPSPEED_AVAILABLE:
     import deepspeed
@@ -76,7 +76,7 @@ def __init__(
         if not _HPU_DEEPSPEED_AVAILABLE:
             raise MisconfigurationException(
                 "To use the `HPUDeepSpeedPrecisionPlugin`, you must have hpu DeepSpeed installed."
-                " Install it by running `pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.17.0`."
+                " Install it by running `pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0`."
             )
         super().__init__(device=device, precision=precision)
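Note: `_HPU_DEEPSPEED_AVAILABLE` is gated by `RequirementCache` from the `lightning-utilities` package, which checks a pip requirement string against the installed distribution. A hedged usage sketch (illustrative, not code from this diff):

    # RequirementCache evaluates the pinned requirement once; bool() runs the
    # check, and str() of a failed check explains what is missing.
    from lightning_utilities.core.imports import RequirementCache

    _HPU_DEEPSPEED_AVAILABLE = RequirementCache("deepspeed==0.14.4+hpu.synapse.v1.18.0")
    if not _HPU_DEEPSPEED_AVAILABLE:
        print(str(_HPU_DEEPSPEED_AVAILABLE))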
6 changes: 3 additions & 3 deletions src/lightning_habana/pytorch/strategies/deepspeed.py
@@ -86,8 +86,8 @@
 warning_cache = WarningCache()

 _HPU_DEEPSPEED_AVAILABLE = (
-    # HPU deep speed is supported only through this pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.17.0
-    RequirementCache("deepspeed==0.14.0+hpu.synapse.v1.17.0")
+    # HPU deep speed is supported only through this pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0
+    RequirementCache("deepspeed==0.14.4+hpu.synapse.v1.18.0")
 )
 if TYPE_CHECKING and _HPU_DEEPSPEED_AVAILABLE:
     import deepspeed
@@ -302,7 +302,7 @@ def __init__(
         if not _HPU_DEEPSPEED_AVAILABLE:
             raise MisconfigurationException(
                 "To use the `HPUDeepSpeedStrategy`, you must have hpu DeepSpeed installed."
-                " Install it by running `pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.17.0`."
+                " Install it by running `pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0`."
             )

         super().__init__(
8 changes: 5 additions & 3 deletions src/lightning_habana/utils/resources.py
@@ -169,17 +169,19 @@ def is_fp8_available() -> Tuple[bool, str]:
     """Returns a bool indicating if fp8 is available."""
     if not _HABANA_FRAMEWORK_AVAILABLE:
         raise OSError("Habana Frameworks required for training on Habana devices.")
-    import habana_frameworks.torch.hpex.experimental.transformer_engine as tengine

-    return tengine.fp8.is_fp8_available()
+    if get_device_name_from_hlsmi() == "GAUDI":
+        return False, "FP8 not supported on Gaudi, Gaudi2 or higher required"
+    return True, ""


 @lru_cache
 def is_fp16_available() -> Tuple[bool, str]:
     """Returns a bool indicating if fp16 is available."""
     if not _HABANA_FRAMEWORK_AVAILABLE:
         raise OSError("Habana Frameworks required for training on Habana devices.")
-    if torch_hpu.get_device_name() == "GAUDI":
+
+    if get_device_name_from_hlsmi() == "GAUDI":
         return False, "FP16 not supported on Gaudi, Gaudi2 or higher required."
     return True, ""
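Note: both helpers keep the `(bool, reason)` return contract while switching the device probe to `get_device_name_from_hlsmi()`, so fp8 no longer requires a transformer_engine import at call time. A hedged usage sketch (assumes an HPU host with the package installed):

    # Both checks return (available, reason); the reason doubles as a skip message.
    from lightning_habana.utils.resources import is_fp8_available, is_fp16_available

    for name, check in (("fp8", is_fp8_available), ("fp16", is_fp16_available)):
        available, reason = check()
        if not available:
            print(f"{name} unavailable: {reason}")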
2 changes: 1 addition & 1 deletion tests/test_pytorch/strategies/test_deepspeed.py
@@ -905,7 +905,7 @@ def test_step(self, batch, batch_idx):
     bf16_loss = torch.tensor(0.6641)
     if device_count == 2:
         bf16_loss = torch.tensor(1.2734)
-    assert torch.allclose(bf16_test_loss, bf16_loss, rtol=1e-5, atol=1e-5)
+    assert torch.allclose(bf16_test_loss, bf16_loss, rtol=1e-4, atol=1e-4)


 @pytest.mark.standalone_only()  # HQT cannot be reconfigured in same process
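Note: `torch.allclose(a, b, rtol, atol)` passes when `|a - b| <= atol + rtol * |b|`, so relaxing both tolerances from 1e-5 to 1e-4 absorbs small loss drift across software releases. A worked illustration with hypothetical values (not taken from the test):

    import torch

    a = torch.tensor(0.66406)  # hypothetical observed bf16 loss
    b = torch.tensor(0.6641)   # reference loss; |a - b| is about 4e-5

    assert not torch.allclose(a, b, rtol=1e-5, atol=1e-5)  # old tolerance fails
    assert torch.allclose(a, b, rtol=1e-4, atol=1e-4)      # new tolerance passes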
3 changes: 2 additions & 1 deletion tests/test_pytorch/test_compile.py
@@ -226,9 +226,10 @@ def test_ddp_strategy_with_compile(tmp_path, arg_hpus):
     ("record_module_names", "expectation"),
     [
         (False, nullcontext()),
-        (
+        pytest.param(
             True,
             pytest.raises(TypeError, match=r"nullcontext.__enter__\(\) missing 1 required positional argument: 'self'"),
+            marks=pytest.mark.xfail(reason="Failure with 1.18"),
         ),
     ],
 )
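Note: switching the case from a plain tuple to `pytest.param` is what allows attaching a mark (here `xfail`) to that single parametrized entry. A minimal self-contained sketch of the pattern (names are illustrative):

    import pytest

    @pytest.mark.parametrize(
        "value",
        [
            False,
            pytest.param(True, marks=pytest.mark.xfail(reason="known failure")),
        ],
    )
    def test_value(value):
        assert value is False  # the True case fails and is reported as xfail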
6 changes: 3 additions & 3 deletions tests/test_pytorch/test_dynamic_shapes.py
@@ -61,11 +61,11 @@ def run_training(tmpdir, hpus, model, data_module):

 def test_dynamic_shapes_recompilations_recipe_caching(tmpdir, arg_hpus, monkeypatch):
     """Tests number of recompilations between cached and non-cached runs."""
-    with monkeypatch.context() as m:
-        m.setenv("PT_HPU_RECIPE_CACHE_CONFIG", f"{tmpdir}/recipes/,true,1024")
-        cached_compiles = run_training(tmpdir, hpus=arg_hpus, model=DynamicOpsBoringModel, data_module=BoringDataModule)
     default_compiles = run_training(tmpdir, hpus=arg_hpus, model=DynamicOpsBoringModel, data_module=BoringDataModule)

+    monkeypatch.setenv("PT_HPU_RECIPE_CACHE_CONFIG", f"{tmpdir}/recipes,True,1024")
+    cached_compiles = run_training(tmpdir, hpus=arg_hpus, model=DynamicOpsBoringModel, data_module=BoringDataModule)

     assert cached_compiles[0] <= default_compiles[0]
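Note: a function-scoped `monkeypatch.setenv` is undone automatically at test teardown, so the explicit `monkeypatch.context()` block is unnecessary; reordering also keeps the default run free of the recipe-cache variable. A minimal sketch of the scoping behavior (test name is illustrative):

    import os

    def test_setenv_is_scoped(monkeypatch, tmp_path):
        # Set for this test only; pytest restores the environment afterwards.
        monkeypatch.setenv("PT_HPU_RECIPE_CACHE_CONFIG", f"{tmp_path}/recipes,True,1024")
        assert os.environ["PT_HPU_RECIPE_CACHE_CONFIG"].endswith(",True,1024")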
24 changes: 15 additions & 9 deletions tests/test_pytorch/test_precision.py
@@ -118,10 +118,10 @@ class BMPluginActive(BaseBM):
     def forward(self, x):
         """Forward."""
         if self.trainer.precision == "fp8":
-            assert tengine.fp8.is_fp8_enabled()
+            assert tengine.fp8.FP8GlobalStateManager.is_fp8_enabled()
             assert not torch.hpu.is_autocast_hpu_enabled()
         else:
-            assert not tengine.fp8.is_fp8_enabled()
+            assert not tengine.fp8.FP8GlobalStateManager.is_fp8_enabled()
             assert torch.hpu.is_autocast_hpu_enabled()
         return super().forward(x)
@@ -307,7 +307,13 @@ def test_hpu_precision_fp8_patch(patch_path, tmpdir, fp8_config):
             pytest.raises(FileNotFoundError),
         ),
         (
-            {"inference": True, "quant": {"mode": "MEASURE"}},
+            {
+                "inference": True,
+                "quant": {
+                    "mode": "MEASURE",
+                    "allowlist": {"types": [], "names": []},
+                },
+            },
             nullcontext(),
         ),
         ({"inference": False}, nullcontext()),
@@ -365,17 +371,17 @@ def test_hpu_precision_fp8_inference_with_quant_dict(tmpdir):
 @pytest.mark.standalone_only()
 @pytest.mark.skipif(get_device_name_from_hlsmi() == "GAUDI", reason="fp8 supported on Gaudi2 and above.")
 def test_hpu_precision_fp8_inference_log_files(tmpdir):
-    log_file = os.path.join(os.environ["HABANA_LOGS"], "hqt_log.txt")
-    # remove log file if it exists
+    log_file = os.path.join(os.environ["HABANA_LOGS"], "inc_log.txt")
+    file_size = 0  # if file does not exist
     if os.path.isfile(log_file):
-        os.remove(log_file)
+        file_size = os.path.getsize(log_file)  # file exists. log will be appended.

     precision_plugin = HPUPrecisionPlugin(precision="fp8")
     precision_plugin.convert_modules(module=BoringModel(), inference=True, quant=False, fp8_data_path=tmpdir)

-    # check log file is created with size > 0
     assert os.path.isfile(log_file)
-    assert os.path.getsize(log_file) > 0
+    assert os.path.getsize(log_file) > file_size


 @pytest.mark.standalone_only()
@@ -781,9 +787,9 @@ def training_step(self, batch, batch_idx):
         # fp8 training is only enabled when precision is fp8,
         # even if module used is from transformer engine.
         if precision == "fp8":
-            assert tengine.fp8.is_fp8_enabled()
+            assert tengine.fp8.FP8GlobalStateManager.is_fp8_enabled()
         else:
-            assert not tengine.fp8.is_fp8_enabled()
+            assert not tengine.fp8.FP8GlobalStateManager.is_fp8_enabled()
         return super().training_step(batch, batch_idx)

     def configure_optimizers(self):
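Note: the repeated rename reflects a newer transformer-engine layout in which the fp8 state query moved from the module level to `FP8GlobalStateManager`. One way to absorb such moves is a small compatibility shim (a hedged sketch, not code from this repo):

    import habana_frameworks.torch.hpex.experimental.transformer_engine as tengine

    def is_fp8_enabled() -> bool:
        # Prefer the newer class-level API; fall back to the older module-level one.
        manager = getattr(tengine.fp8, "FP8GlobalStateManager", None)
        if manager is not None:
            return manager.is_fp8_enabled()
        return tengine.fp8.is_fp8_enabled()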
