This repository was archived by the owner on Aug 7, 2024. It is now read-only.

Commit 653e120

Add utility for filtering out skipped tests in large parametrization groups
ghstack-source-id: 5c80e94 Pull Request resolved: #303
1 parent 3398526 commit 653e120

File tree

3 files changed: +83 additions, −71 deletions


float8_experimental/float8_linear.py

Lines changed: 1 addition & 1 deletion
@@ -296,7 +296,7 @@ def cast_x_to_float8(
         if torch.is_autocast_enabled():
             # For now, hardcode to GPU's autocast dtype
             # if we need CPU support in the future, we can add it
-            autocast_dtype = torch.get_autocast_gpu_dtype()
+            autocast_dtype = torch.get_autocast_dtype("cuda")
             x = x.to(autocast_dtype)

         if self.scaling_type_x is TensorScalingType.DELAYED:
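
The one-line change above swaps the GPU-specific autocast query for the device-generic one. A minimal sketch of the difference, assuming a PyTorch build new enough to expose torch.get_autocast_dtype (on such builds the older helper still works but is deprecated):

    import torch

    # Both calls read the autocast dtype configured for CUDA; the device-generic
    # form takes the device type as a string argument.
    old_style = torch.get_autocast_gpu_dtype()    # deprecated spelling
    new_style = torch.get_autocast_dtype("cuda")  # device-generic replacement
    assert old_style == new_style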

float8_experimental/float8_linear_utils.py

Lines changed: 1 addition & 1 deletion
@@ -274,7 +274,7 @@ def sync_float8_amax_and_scale_history(model: torch.nn.Module, fp8_layers=None)
         fp8_layers = get_float8_layers(model)

     if len(fp8_layers) == 0:
-        log.warn(
+        log.warning(
             "Calling sync_float8_amax_and_scale_history on a module with no Float8Linear layers"
         )
         return
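
The logging change above is behavior-preserving but removes a deprecation: Logger.warn has long been a deprecated alias of Logger.warning in the Python standard library. A minimal sketch (the logger name is hypothetical, for illustration only):

    import logging

    logging.basicConfig(level=logging.WARNING)
    log = logging.getLogger("float8_linear_utils_sketch")  # hypothetical logger name

    # Logger.warning is the supported spelling; log.warn(...) would emit a
    # DeprecationWarning on current CPython versions before delegating to warning().
    log.warning(
        "Calling sync_float8_amax_and_scale_history on a module with no Float8Linear layers"
    )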

test/test_base.py

Lines changed: 81 additions & 69 deletions
@@ -9,6 +9,8 @@
 import re
 import unittest
 import warnings
+from itertools import product
+from typing import Any, Callable, Dict, List, Optional, Tuple

 import pytest

@@ -52,6 +54,42 @@
 is_H100 = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (9, 0)


+def filtered_parametrize(
+    param_list: List[Tuple[str, List[Any]]],
+    filter_func: Optional[Callable[[Dict[str, Any]], bool]] = None,
+):
+    """
+    A decorator that works like pytest.mark.parametrize but filters out
+    unwanted parameter combinations.
+
+    Args:
+        param_list: A list of tuples, each containing (arg_name, [arg_values])
+        filter_func: A function that takes a dictionary of parameter names and values,
+            and returns True for valid combinations, False otherwise
+
+    """
+
+    def decorator(func):
+        arg_names = [param[0] for param in param_list]
+        arg_values = [param[1] for param in param_list]
+
+        all_combinations = product(*arg_values)
+        if filter_func:
+            valid_combinations = [
+                combo
+                for combo in all_combinations
+                if filter_func(dict(zip(arg_names, combo)))
+            ]
+        else:
+            valid_combinations = list(all_combinations)
+
+        return pytest.mark.parametrize(
+            argnames=arg_names, argvalues=valid_combinations
+        )(func)
+
+    return decorator
+
+
 def bitwise_identical(a: Float8Tensor, b: Float8Tensor) -> bool:
     assert torch.all(a._data == b._data).item(), "scales are not identical"
     assert torch.all(a._data == b._data).item(), "data is not identical"
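
Unlike stacking pytest.mark.parametrize and calling pytest.skip inside the test body, filtering at decoration time means rejected combinations are never collected as test items at all. A minimal usage sketch of the helper defined above (the parameter names a and b are hypothetical):

    # Only combinations that pass filter_func become pytest parameters; here the
    # (2, 1) combination is dropped up front rather than skipped at runtime.
    @filtered_parametrize(
        [("a", [1, 2]), ("b", [1, 2])],
        filter_func=lambda params: params["a"] <= params["b"],
    )
    def test_ordered_pair(a, b):
        assert a <= b
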
@@ -230,17 +268,35 @@ def _test_linear_impl(
         # verify initialization flags got updated
         assert m_fp8.is_amax_initialized, "Amax was not properly initialized"

-    @pytest.mark.parametrize("emulate", [True, False] if is_H100 else [True])
-    @pytest.mark.parametrize("x_shape", [(16, 16), (2, 16, 16), (3, 2, 16, 16)])
-    @pytest.mark.parametrize("linear_type", [LinearType.DELAYED, LinearType.DYNAMIC])
-    @pytest.mark.parametrize(
-        "scaling_type_x", [TensorScalingType.DELAYED, TensorScalingType.DYNAMIC]
-    )
-    @pytest.mark.parametrize(
-        "scaling_type_w", [TensorScalingType.DELAYED, TensorScalingType.DYNAMIC]
-    )
-    @pytest.mark.parametrize(
-        "scaling_type_dL_dY", [TensorScalingType.DELAYED, TensorScalingType.DYNAMIC]
+    @staticmethod
+    def is_valid_combination(params):
+        if not params["emulate"]:
+            if not torch.cuda.is_available():
+                return False
+            if torch.cuda.get_device_capability() < (9, 0):
+                return False
+
+        if params["linear_type"] == LinearType.DYNAMIC:
+            return all(
+                params[key] == TensorScalingType.DYNAMIC
+                for key in ["scaling_type_x", "scaling_type_w", "scaling_type_dL_dY"]
+            )
+
+        return True
+
+    @filtered_parametrize(
+        [
+            ("x_shape", [(16, 16), (2, 16, 16), (3, 2, 16, 16)]),
+            ("linear_type", [LinearType.DELAYED, LinearType.DYNAMIC]),
+            ("emulate", [True, False] if is_H100 else [True]),
+            ("scaling_type_x", [TensorScalingType.DELAYED, TensorScalingType.DYNAMIC]),
+            ("scaling_type_w", [TensorScalingType.DELAYED, TensorScalingType.DYNAMIC]),
+            (
+                "scaling_type_dL_dY",
+                [TensorScalingType.DELAYED, TensorScalingType.DYNAMIC],
+            ),
+        ],
+        filter_func=is_valid_combination,
     )
     @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
     def test_linear_nobias(
@@ -252,28 +308,6 @@ def test_linear_nobias(
         scaling_type_w: TensorScalingType,
         scaling_type_dL_dY: TensorScalingType,
     ):
-        if not emulate:
-            if not torch.cuda.is_available():
-                warnings.warn("CUDA not available")
-                pytest.skip()
-            elif torch.cuda.get_device_capability() < (9, 0):
-                warnings.warn(
-                    f"CUDA capability {torch.cuda.get_device_capability()} < (9.0)"
-                )
-                pytest.skip()
-        if linear_type is LinearType.DYNAMIC:
-            # Only test one combination of scaling types, as they are a no-op
-            # for Float8DynamicLinear. It would be cleaner to split into two
-            # tests, but IMO not worth it since Float8DynamicLinear will be
-            # deleted soon
-            is_all_dynamic = (
-                scaling_type_x is TensorScalingType.DYNAMIC
-                and scaling_type_w is TensorScalingType.DYNAMIC
-                and scaling_type_dL_dY is TensorScalingType.DYNAMIC
-            )
-            if not is_all_dynamic:
-                pytest.skip()
-
         x = torch.randn(*x_shape, device="cuda")
         m_ref = nn.Linear(16, 32, bias=False, device="cuda")
         self._test_linear_impl(
@@ -286,20 +320,20 @@ def test_linear_nobias(
             scaling_type_dL_dY,
         )

-    @pytest.mark.parametrize("emulate", [True, False] if is_H100 else [True])
-    @pytest.mark.parametrize("x_shape", [(16, 16), (2, 16, 16), (3, 2, 16, 16)])
-    @pytest.mark.parametrize("linear_type", [LinearType.DELAYED, LinearType.DYNAMIC])
-    @pytest.mark.parametrize(
-        "scaling_type_x", [TensorScalingType.DELAYED, TensorScalingType.DYNAMIC]
-    )
-    @pytest.mark.parametrize(
-        "scaling_type_w", [TensorScalingType.DELAYED, TensorScalingType.DYNAMIC]
-    )
-    @pytest.mark.parametrize(
-        "scaling_type_dL_dY", [TensorScalingType.DELAYED, TensorScalingType.DYNAMIC]
-    )
-    @pytest.mark.parametrize(
-        "linear_dtype", [torch.float16, torch.bfloat16, torch.float32]
+    @filtered_parametrize(
+        [
+            ("x_shape", [(16, 16), (2, 16, 16), (3, 2, 16, 16)]),
+            ("linear_type", [LinearType.DELAYED, LinearType.DYNAMIC]),
+            ("emulate", [True, False] if is_H100 else [True]),
+            ("scaling_type_x", [TensorScalingType.DELAYED, TensorScalingType.DYNAMIC]),
+            ("scaling_type_w", [TensorScalingType.DELAYED, TensorScalingType.DYNAMIC]),
+            (
+                "scaling_type_dL_dY",
+                [TensorScalingType.DELAYED, TensorScalingType.DYNAMIC],
+            ),
+            ("linear_dtype", [torch.float16, torch.bfloat16, torch.float32]),
+        ],
+        filter_func=is_valid_combination,
     )
     @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
     def test_linear_bias(
@@ -312,28 +346,6 @@ def test_linear_bias(
         emulate: bool,
         linear_dtype: torch.dtype,
     ):
-        if not emulate:
-            if not torch.cuda.is_available():
-                warnings.warn("CUDA not available")
-                pytest.skip()
-            elif torch.cuda.get_device_capability() < (9, 0):
-                warnings.warn(
-                    f"CUDA capability {torch.cuda.get_device_capability()} < (9.0)"
-                )
-                pytest.skip()
-        if linear_type is LinearType.DYNAMIC:
-            # Only test one combination of scaling types, as they are a no-op
-            # for Float8DynamicLinear. It would be cleaner to split into two
-            # tests, but IMO not worth it since Float8DynamicLinear will be
-            # deleted soon
-            is_all_dynamic = (
-                scaling_type_x is TensorScalingType.DYNAMIC
-                and scaling_type_w is TensorScalingType.DYNAMIC
-                and scaling_type_dL_dY is TensorScalingType.DYNAMIC
-            )
-            if not is_all_dynamic:
-                pytest.skip()
-
         x = torch.randn(*x_shape, device="cuda", dtype=linear_dtype)
         m_ref = nn.Linear(16, 32, bias=True, device="cuda", dtype=linear_dtype)
         self._test_linear_impl(
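
Taken together, these changes prune the parametrization grid up front instead of emitting dozens of skips at run time. A rough sketch of the arithmetic for test_linear_nobias, assuming an H100-class GPU so that both emulate values survive the filter (the strings below stand in for the real enum values):

    from itertools import product

    # 3 shapes x 2 linear types x 2 emulate values x 2^3 scaling types = 96 raw
    # combinations; DYNAMIC linear_type keeps only the all-DYNAMIC scaling triple.
    raw = list(
        product(
            [(16, 16), (2, 16, 16), (3, 2, 16, 16)],  # x_shape
            ["delayed", "dynamic"],                    # linear_type
            [True, False],                             # emulate
            ["delayed", "dynamic"],                    # scaling_type_x
            ["delayed", "dynamic"],                    # scaling_type_w
            ["delayed", "dynamic"],                    # scaling_type_dL_dY
        )
    )
    kept = [c for c in raw if c[1] == "delayed" or all(s == "dynamic" for s in c[3:])]
    print(len(raw), len(kept))  # 96 54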
