From cf2bc6c7a00eec81fbe23b2ee28b32357531f494 Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Tue, 3 Jun 2025 10:40:22 +0000 Subject: [PATCH 1/7] fixed the bug Signed-off-by: Pawel Gadzinski --- tests/pytorch/debug/test_numerics.py | 30 +++++++++++++++++++ .../debug/features/utils/stats_buffer.py | 3 ++ .../debug/features/utils/stats_computation.py | 2 ++ 3 files changed, 35 insertions(+) diff --git a/tests/pytorch/debug/test_numerics.py b/tests/pytorch/debug/test_numerics.py index 55c3ab9b7e..f5a12afe30 100644 --- a/tests/pytorch/debug/test_numerics.py +++ b/tests/pytorch/debug/test_numerics.py @@ -262,6 +262,21 @@ def _get_tensors(): return x, weight +LOGGING_CONFIG = """logging_config: + enabled: True + layers: + layer_types: [linear] + transformer_engine: + LogTensorStats: + enabled: True + tensors: [activation, gradient, weight, output, wgrad, dgrad] + stats: [min, max, mean, std, l1_norm, l2_norm, cur_amax, dynamic_range] +""" + + + + + DISABLE_FP8_CONFIG = Template( """disable_fp8_config: enabled: True @@ -274,6 +289,21 @@ def _get_tensors(): """ ) +@create_config_file +def run_logging_zero_numel_tensor(feature_dirs, **kwargs): + kwargs["config_file"].write(LOGGING_CONFIG) + kwargs["config_file"].flush() + + _init_debug(kwargs["config_file"].name, kwargs["log_dir"], feature_dirs) + + x, weight = _get_tensors() + x1 = x[:0, :] + model = _init_model(weight) + _ = _run_forward_backward(x1, model) + _ = _run_forward_backward(x, model) + +def test_logging_zero_numel_tensor(feature_dirs): + run_logging_zero_numel_tensor(feature_dirs) @pytest.mark.parametrize("fprop_fp8", all_boolean) @pytest.mark.parametrize("dgrad_fp8", all_boolean) diff --git a/transformer_engine/debug/features/utils/stats_buffer.py b/transformer_engine/debug/features/utils/stats_buffer.py index 2313484054..c99642786d 100644 --- a/transformer_engine/debug/features/utils/stats_buffer.py +++ b/transformer_engine/debug/features/utils/stats_buffer.py @@ -84,6 +84,9 @@ def feed(self, tensor, iteration): # It is used for weights and microbatching. if self.modified[0] and not self.reduce_within_microbatch: return + + if tensor.numel() == 0: + return # save stats for tensor to tmp buffer for stat_name in self.stats_to_compute: diff --git a/transformer_engine/debug/features/utils/stats_computation.py b/transformer_engine/debug/features/utils/stats_computation.py index d111e48903..4d16c64f01 100644 --- a/transformer_engine/debug/features/utils/stats_computation.py +++ b/transformer_engine/debug/features/utils/stats_computation.py @@ -17,6 +17,8 @@ def _compute_dynamic_range_top(tensor): """Computes the log2 of the amax of the tensor""" tensor_abs = tensor.abs() tensor_abs = tensor_abs[tensor_abs != 0] + if tensor_abs.numel() == 0: + return torch.inf amax = tensor_abs.max().float() if not amax.all(): amax = torch.tensor(1, device=tensor.device).to(torch.float) From f6734db5768c0008d09c94437f638b03ce22534f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 3 Jun 2025 10:43:04 +0000 Subject: [PATCH 2/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/pytorch/debug/test_numerics.py | 6 +++--- transformer_engine/debug/features/utils/stats_buffer.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/pytorch/debug/test_numerics.py b/tests/pytorch/debug/test_numerics.py index f5a12afe30..6a89149c7a 100644 --- a/tests/pytorch/debug/test_numerics.py +++ b/tests/pytorch/debug/test_numerics.py @@ -274,9 +274,6 @@ def _get_tensors(): """ - - - DISABLE_FP8_CONFIG = Template( """disable_fp8_config: enabled: True @@ -289,6 +286,7 @@ def _get_tensors(): """ ) + @create_config_file def run_logging_zero_numel_tensor(feature_dirs, **kwargs): kwargs["config_file"].write(LOGGING_CONFIG) @@ -302,9 +300,11 @@ def run_logging_zero_numel_tensor(feature_dirs, **kwargs): _ = _run_forward_backward(x1, model) _ = _run_forward_backward(x, model) + def test_logging_zero_numel_tensor(feature_dirs): run_logging_zero_numel_tensor(feature_dirs) + @pytest.mark.parametrize("fprop_fp8", all_boolean) @pytest.mark.parametrize("dgrad_fp8", all_boolean) @pytest.mark.parametrize("wgrad_fp8", all_boolean) diff --git a/transformer_engine/debug/features/utils/stats_buffer.py b/transformer_engine/debug/features/utils/stats_buffer.py index c99642786d..ce1ef93864 100644 --- a/transformer_engine/debug/features/utils/stats_buffer.py +++ b/transformer_engine/debug/features/utils/stats_buffer.py @@ -84,7 +84,7 @@ def feed(self, tensor, iteration): # It is used for weights and microbatching. if self.modified[0] and not self.reduce_within_microbatch: return - + if tensor.numel() == 0: return From 30bfd4eb105628bf2faf426bbd7bcc9bf1eeaf68 Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Mon, 9 Jun 2025 16:30:51 +0200 Subject: [PATCH 3/7] fix Signed-off-by: Pawel Gadzinski --- transformer_engine/debug/features/utils/stats_buffer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformer_engine/debug/features/utils/stats_buffer.py b/transformer_engine/debug/features/utils/stats_buffer.py index ce1ef93864..3cfae1854b 100644 --- a/transformer_engine/debug/features/utils/stats_buffer.py +++ b/transformer_engine/debug/features/utils/stats_buffer.py @@ -85,7 +85,7 @@ def feed(self, tensor, iteration): if self.modified[0] and not self.reduce_within_microbatch: return - if tensor.numel() == 0: + if tensor.numel() == 0 if hasattr(tensor, "numel") else all(t.numel() == 0 for t in tensor.get_data_tensors()): return # save stats for tensor to tmp buffer From 7e1222feeba2d9c3154e26dedaa7150ce122d84d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Jun 2025 14:31:20 +0000 Subject: [PATCH 4/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- transformer_engine/debug/features/utils/stats_buffer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/transformer_engine/debug/features/utils/stats_buffer.py b/transformer_engine/debug/features/utils/stats_buffer.py index 3cfae1854b..ee22ac6104 100644 --- a/transformer_engine/debug/features/utils/stats_buffer.py +++ b/transformer_engine/debug/features/utils/stats_buffer.py @@ -85,7 +85,11 @@ def feed(self, tensor, iteration): if self.modified[0] and not self.reduce_within_microbatch: return - if tensor.numel() == 0 if hasattr(tensor, "numel") else all(t.numel() == 0 for t in tensor.get_data_tensors()): + if ( + tensor.numel() == 0 + if hasattr(tensor, "numel") + else all(t.numel() == 0 for t in tensor.get_data_tensors()) + ): return # save stats for tensor to tmp buffer From c9e70f6c459e73f0bc8863e18cf571ea0a2d58b8 Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Mon, 9 Jun 2025 16:32:18 +0200 Subject: [PATCH 5/7] lint fix Signed-off-by: Pawel Gadzinski --- transformer_engine/debug/features/log_fp8_tensor_stats.py | 1 + 1 file changed, 1 insertion(+) diff --git a/transformer_engine/debug/features/log_fp8_tensor_stats.py b/transformer_engine/debug/features/log_fp8_tensor_stats.py index e5c84a9bda..1be9b586f9 100644 --- a/transformer_engine/debug/features/log_fp8_tensor_stats.py +++ b/transformer_engine/debug/features/log_fp8_tensor_stats.py @@ -141,6 +141,7 @@ def inspect_tensor_postquantize( stat in self._get_supported_stats_list() ), f"[NVTORCH INSPECT ERROR] Statistic {stat} is not supported." + STATS_BUFFERS.try_add_buffer( layer_name=layer_name, tensor_name=tensor_name, From dce4846f7279411e8931fa70a81c1ca61cf472e0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Jun 2025 14:32:48 +0000 Subject: [PATCH 6/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- transformer_engine/debug/features/log_fp8_tensor_stats.py | 1 - 1 file changed, 1 deletion(-) diff --git a/transformer_engine/debug/features/log_fp8_tensor_stats.py b/transformer_engine/debug/features/log_fp8_tensor_stats.py index 1be9b586f9..e5c84a9bda 100644 --- a/transformer_engine/debug/features/log_fp8_tensor_stats.py +++ b/transformer_engine/debug/features/log_fp8_tensor_stats.py @@ -141,7 +141,6 @@ def inspect_tensor_postquantize( stat in self._get_supported_stats_list() ), f"[NVTORCH INSPECT ERROR] Statistic {stat} is not supported." - STATS_BUFFERS.try_add_buffer( layer_name=layer_name, tensor_name=tensor_name, From 732f12cc04170daecd4a03211216ba5388069e01 Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Mon, 9 Jun 2025 17:04:23 +0200 Subject: [PATCH 7/7] fix Signed-off-by: Pawel Gadzinski --- transformer_engine/debug/features/utils/stats_buffer.py | 2 +- transformer_engine/debug/features/utils/stats_computation.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/transformer_engine/debug/features/utils/stats_buffer.py b/transformer_engine/debug/features/utils/stats_buffer.py index ee22ac6104..4be465f8e8 100644 --- a/transformer_engine/debug/features/utils/stats_buffer.py +++ b/transformer_engine/debug/features/utils/stats_buffer.py @@ -88,7 +88,7 @@ def feed(self, tensor, iteration): if ( tensor.numel() == 0 if hasattr(tensor, "numel") - else all(t.numel() == 0 for t in tensor.get_data_tensors()) + else all((t is None or t.numel() == 0) for t in tensor.get_data_tensors()) ): return diff --git a/transformer_engine/debug/features/utils/stats_computation.py b/transformer_engine/debug/features/utils/stats_computation.py index 4d16c64f01..ed32de1ae2 100644 --- a/transformer_engine/debug/features/utils/stats_computation.py +++ b/transformer_engine/debug/features/utils/stats_computation.py @@ -127,7 +127,7 @@ def _get(buffers, stat_name): lambda buffers: min(_get(buffers, "dynamic_range_bottom")), ), "underflows_num": ( - lambda x: (x._data == 0).sum(), + lambda x: (x.get_data_tensors()[0] == 0).sum(), lambda buffers: sum(_get(buffers, "underflows_num")), ), "std": (