Add FP8 support to CP implementation with KV P2P (#1114)
* add window_size to AttnFuncWithCP

Signed-off-by: Xiaowei Ren <[email protected]>

* add seq_offsets_qkvo for cudnn thd

Signed-off-by: Xiaowei Ren <[email protected]>

* add seq_offsets_qkvo to AttnFuncWithCP

Signed-off-by: Xiaowei Ren <[email protected]>

* fix seq_offsets calculation of cudnn thd

Signed-off-by: Xiaowei Ren <[email protected]>

* remove a thd assert

Signed-off-by: Xiaowei Ren <[email protected]>

* fix bias for thd test

Signed-off-by: Xiaowei Ren <[email protected]>

* add thd test for cudnn FA with CP

Signed-off-by: Xiaowei Ren <[email protected]>

* skip GQA/MQA test for cuDNN THD

Signed-off-by: Xiaowei Ren <[email protected]>

* make sure seq_offsets are computed with qkv_group of hd_hd_hd while CP>1

Signed-off-by: Xiaowei Ren <[email protected]>

* fix seq_offsets inputs

Signed-off-by: Xiaowei Ren <[email protected]>

* remove two comments

Signed-off-by: Xiaowei Ren <[email protected]>

* fix attn mask type for cudnn thd with cp

Signed-off-by: Xiaowei Ren <[email protected]>

* fix attn_mask_type check

Signed-off-by: Xiaowei Ren <[email protected]>

* fix attn_mask_type for cudnn fa with thd

Signed-off-by: Xiaowei Ren <[email protected]>

* fix a typo

Signed-off-by: Xiaowei Ren <[email protected]>

* fix out dout in bwd

Signed-off-by: Xiaowei Ren <[email protected]>

* assert cudnn+thd does not support attn bias

Signed-off-by: Xiaowei Ren <[email protected]>

* check if attn_mask_type has padding

Signed-off-by: Xiaowei Ren <[email protected]>

* minor change

Signed-off-by: Xiaowei Ren <[email protected]>

* change cp test batch size to 2

Signed-off-by: Xiaowei Ren <[email protected]>

* fix code format

Signed-off-by: Xiaowei Ren <[email protected]>

* fix two assert info

Signed-off-by: Xiaowei Ren <[email protected]>

* fix assert comment

Signed-off-by: Xiaowei Ren <[email protected]>

* fix assert comments

Signed-off-by: Xiaowei Ren <[email protected]>

* minor fix

Signed-off-by: Xiaowei Ren <[email protected]>

* fix assert comments

Signed-off-by: Xiaowei Ren <[email protected]>

* assert swa+CP cannot work with thd format

Signed-off-by: Xiaowei Ren <[email protected]>

* add a new CP function for swa

Signed-off-by: Xiaowei Ren <[email protected]>

* add a missing dgrads

Signed-off-by: Xiaowei Ren <[email protected]>

* minor change

Signed-off-by: Xiaowei Ren <[email protected]>

* add draft fwd function for swa+cp

Signed-off-by: Xiaowei Ren <[email protected]>

* minor change

Signed-off-by: Xiaowei Ren <[email protected]>

* enable flash attention for swa+cp

Signed-off-by: Xiaowei Ren <[email protected]>

* remove an assert of swa+cp

Signed-off-by: Xiaowei Ren <[email protected]>

* call SWAFuncWithCP for swa+cp

Signed-off-by: Xiaowei Ren <[email protected]>

* typo fix

Signed-off-by: Xiaowei Ren <[email protected]>

* use 2hd layout

Signed-off-by: Xiaowei Ren <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* change qkv_format check

Signed-off-by: Xiaowei Ren <[email protected]>

* add a code comment

Signed-off-by: Xiaowei Ren <[email protected]>

* tensor shape bug fix

Signed-off-by: Xiaowei Ren <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* tensor shape fix

Signed-off-by: Xiaowei Ren <[email protected]>

* add function to compute cu_seqlens of a cp rank

Signed-off-by: Xiaowei Ren <[email protected]>

* add cu_seqlens and cu_seqlens_padded to context parallelism

Signed-off-by: Xiaowei Ren <[email protected]>

* typo fix

Signed-off-by: Xiaowei Ren <[email protected]>

* minor change

Signed-off-by: Xiaowei Ren <[email protected]>

* fix FlashAttention output sequence length

Signed-off-by: Xiaowei Ren <[email protected]>

* fix cu_seqlens_kv_per_step calculation

Signed-off-by: Xiaowei Ren <[email protected]>

* zero dQKV for ending padded tokens

Signed-off-by: Xiaowei Ren <[email protected]>

* zero dQKV tensors of FlashAttention

Signed-off-by: Xiaowei Ren <[email protected]>

* fix softmax_lse correction

Signed-off-by: Xiaowei Ren <[email protected]>

* remove padded tokens of KV to save communication

Signed-off-by: Xiaowei Ren <[email protected]>

* do not need to zero dkv for FlashAttention any more

Signed-off-by: Xiaowei Ren <[email protected]>

* zero out tensors

Signed-off-by: Xiaowei Ren <[email protected]>

* remove redundant code

Signed-off-by: Xiaowei Ren <[email protected]>

* fix CP unit test

Signed-off-by: Xiaowei Ren <[email protected]>

* fix kv shape of cp test with thd format

Signed-off-by: Xiaowei Ren <[email protected]>

* update cp unit test

Signed-off-by: Xiaowei Ren <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add simple code framework

Signed-off-by: Xiaowei Ren <[email protected]>

* try not to have a separate CP function for SWA

Signed-off-by: Xiaowei Ren <[email protected]>

* backup some code change

Signed-off-by: Xiaowei Ren <[email protected]>

* back up code

Signed-off-by: Xiaowei Ren <[email protected]>

* clean up fwd implementation of SWAFuncWithCP

Signed-off-by: Xiaowei Ren <[email protected]>

* remove redundant code

Signed-off-by: Xiaowei Ren <[email protected]>

* code cleaning

Signed-off-by: Xiaowei Ren <[email protected]>

* fix assert info

Signed-off-by: Xiaowei Ren <[email protected]>

* reduce kv chunk concat overheads

Signed-off-by: Xiaowei Ren <[email protected]>

* minor change

Signed-off-by: Xiaowei Ren <[email protected]>

* make AttnFuncWithCP and SWAFuncWithCP have same API

Signed-off-by: Xiaowei Ren <[email protected]>

* add a docstring

Signed-off-by: Xiaowei Ren <[email protected]>

* preliminary implementation of SWAFuncWithCP forward seems working

Signed-off-by: Xiaowei Ren <[email protected]>

* fix output shape of SWAFuncWithCP

Signed-off-by: Xiaowei Ren <[email protected]>

* code refactoring for FlashAttention and add a code placeholder for bwd

Signed-off-by: Xiaowei Ren <[email protected]>

* use gather_along_first_dim

Signed-off-by: Xiaowei Ren <[email protected]>

* finish the preliminary implementation of bwd

Signed-off-by: Xiaowei Ren <[email protected]>

* remove redundant code

Signed-off-by: Xiaowei Ren <[email protected]>

* fix assert condition

Signed-off-by: Xiaowei Ren <[email protected]>

* add draft implementation of SWA+CP with FusedAttention

Signed-off-by: Xiaowei Ren <[email protected]>

* fix attention mask type of swa+cp

Signed-off-by: Xiaowei Ren <[email protected]>

* code cleaning

Signed-off-by: Xiaowei Ren <[email protected]>

* add qkv_layout

Signed-off-by: Xiaowei Ren <[email protected]>

* add missing window_size argument

Signed-off-by: Xiaowei Ren <[email protected]>

* typo fix

Signed-off-by: Xiaowei Ren <[email protected]>

* fix kv shape of swa+cp

Signed-off-by: Xiaowei Ren <[email protected]>

* bug and typo fix

Signed-off-by: Xiaowei Ren <[email protected]>

* fix dout shape

Signed-off-by: Xiaowei Ren <[email protected]>

* add multi stream in fwd of swa+cp

Signed-off-by: Xiaowei Ren <[email protected]>

* save chunk_ids_to_kv_ag in fwd

Signed-off-by: Xiaowei Ren <[email protected]>

* add multi stream in bwd of swa+cp

Signed-off-by: Xiaowei Ren <[email protected]>

* minor fix to cp stream sync

Signed-off-by: Xiaowei Ren <[email protected]>

* rename AttnFuncWithCP

Signed-off-by: Xiaowei Ren <[email protected]>

* check if window size is None

Signed-off-by: Xiaowei Ren <[email protected]>

* fix docstring of AttnFuncWithCP

Signed-off-by: Xiaowei Ren <[email protected]>

* minor fix

Signed-off-by: Xiaowei Ren <[email protected]>

* add env var for users to choose KV ag or KV p2p

Signed-off-by: Xiaowei Ren <[email protected]>

* update cp tests

Signed-off-by: Xiaowei Ren <[email protected]>

* fix window size in cp unit test

Signed-off-by: Xiaowei Ren <[email protected]>

* fix pytest skip messages

Signed-off-by: Xiaowei Ren <[email protected]>

* add cp_comm_type into API

Signed-off-by: Xiaowei Ren <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* code cleaning

Signed-off-by: Xiaowei Ren <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add deterministic knob in cuDNN fused attn backend

Signed-off-by: Xiaowei Ren <[email protected]>

* pass fp8 and fp8_meta to attn_func_with_cp

Signed-off-by: Xiaowei Ren <[email protected]>

* assert only Fused Attn can support FP8+CP

Signed-off-by: Xiaowei Ren <[email protected]>

* remove redundant assert

Signed-off-by: Xiaowei Ren <[email protected]>

* add a fwd draft implementation of FP8 + CP

Signed-off-by: Xiaowei Ren <[email protected]>

* save fp8 and fp8_meta

Signed-off-by: Xiaowei Ren <[email protected]>

* assert sequence length divisibility requirements

Signed-off-by: Xiaowei Ren <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* remove a redundant qkv_layout compute

Signed-off-by: Xiaowei Ren <[email protected]>

* if condition change

Signed-off-by: Xiaowei Ren <[email protected]>

* some typo fix

Signed-off-by: Xiaowei Ren <[email protected]>

* add support table of context parallelism

Signed-off-by: Xiaowei Ren <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* typo and code format fix

Signed-off-by: Xiaowei Ren <[email protected]>

* do not print multiple disabling messages

Signed-off-by: Xiaowei Ren <[email protected]>

* bug fix

Signed-off-by: Xiaowei Ren <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix aux_ctx_tensors of FP8

Signed-off-by: Xiaowei Ren <[email protected]>

* bug fix

Signed-off-by: Xiaowei Ren <[email protected]>

* fix device in torch.arange and adjust code for the PR of MLA

Signed-off-by: Xiaowei Ren <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* commit code change for FP8+CP

Signed-off-by: Xiaowei Ren <[email protected]>

* commit more code change for FP8+CP

Signed-off-by: Xiaowei Ren <[email protected]>

* commit more fp8 code for FP8+CP

Signed-off-by: Xiaowei Ren <[email protected]>

* bug fixes

Signed-off-by: Xiaowei Ren <[email protected]>

* bug fix

Signed-off-by: Xiaowei Ren <[email protected]>

* cast merged CP results from FP32 to BF16

Signed-off-by: Xiaowei Ren <[email protected]>

* typo fix

Signed-off-by: Xiaowei Ren <[email protected]>

* minor change

Signed-off-by: Xiaowei Ren <[email protected]>

* fix softmax_lse

Signed-off-by: Xiaowei Ren <[email protected]>

* fix some bugs of FP8 dkv exchange

Signed-off-by: Xiaowei Ren <[email protected]>

* typo fix

Signed-off-by: Xiaowei Ren <[email protected]>

* add FP8 unit test

Signed-off-by: Xiaowei Ren <[email protected]>

* fix typos and clean asserts

Signed-off-by: Xiaowei Ren <[email protected]>

* fix get_p2p_comm_info

Signed-off-by: Xiaowei Ren <[email protected]>

* fix dkv p2p exchange

Signed-off-by: Xiaowei Ren <[email protected]>

* minor fix

Signed-off-by: Xiaowei Ren <[email protected]>

* change FP8 dkv P2P to A2A

Signed-off-by: Xiaowei Ren <[email protected]>

* add FP8+CP unit test

Signed-off-by: Xiaowei Ren <[email protected]>

* typo fix

Signed-off-by: Xiaowei Ren <[email protected]>

* assert amax reduction is needed for FP8+CP

Signed-off-by: Xiaowei Ren <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* remove duplicated code

Signed-off-by: Xiaowei Ren <[email protected]>

* destroy process group in CP unit test

Signed-off-by: Xiaowei Ren <[email protected]>

* remove interval from fp8_recipe because it has been deprecated

Signed-off-by: Xiaowei Ren <[email protected]>

* try to fix the failed CP test with the latest CI pipeline

Signed-off-by: Xiaowei Ren <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* remove redundant f before string

Signed-off-by: Xiaowei Ren <[email protected]>

* change META_O_CP

Signed-off-by: Xiaowei Ren <[email protected]>

---------

Signed-off-by: Xiaowei Ren <[email protected]>
Co-authored-by: Charlene Yang <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Xiaowei Ren <[email protected]>
4 people committed Aug 21, 2024
1 parent 525de6c commit 26c8fcc
Showing 3 changed files with 592 additions and 263 deletions.
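
Before the diffs, a minimal, self-contained sketch of the FP8 + context-parallelism path this commit enables, distilled from the test changes to run_fused_attn_with_cp.py below. The process-group setup, tensor shapes, head counts, and the DotProductAttention constructor values are assumptions for illustration, not the test's exact configuration.

```python
# Sketch of FP8 dot-product attention with context parallelism (KV P2P),
# based on run_fused_attn_with_cp.py below. Shapes, head counts and the
# DotProductAttention constructor values are assumed, not the test's.
import torch
import torch.distributed as dist
from transformer_engine.pytorch.attention import DotProductAttention
from transformer_engine.pytorch.fp8 import fp8_autocast
from transformer_engine.common.recipe import DelayedScaling

dist.init_process_group(backend="nccl")              # e.g. launched with torchrun, one rank per GPU
torch.cuda.set_device(dist.get_rank())
cp_comm_ranks = list(range(dist.get_world_size()))   # all ranks form one CP group here
cp_comm_group = dist.new_group(cp_comm_ranks, backend="nccl")
cp_comm_type = "p2p"                                 # "all_gather" also exists, but not with FP8 yet

core_attn = DotProductAttention(16, 128)             # num_heads, head_dim (assumed values)
core_attn.set_context_parallel_group(
    cp_comm_group, cp_comm_ranks, torch.cuda.Stream(), cp_comm_type
)

# In the real test each rank holds only its shard of the sequence; full-length
# sbhd tensors are used here purely to keep the sketch short.
q, k, v = (
    torch.randn(512, 2, 16, 128, dtype=torch.bfloat16, device="cuda", requires_grad=True)
    for _ in range(3)
)

# FP8 attention is requested through the recipe (fp8_dpa=True); amax reduction
# runs over the CP group passed as fp8_group.
fp8_recipe = DelayedScaling(fp8_dpa=True)
with fp8_autocast(enabled=True, fp8_recipe=fp8_recipe, fp8_group=cp_comm_group):
    out = core_attn(q, k, v)
out.backward(torch.randn_like(out))
```
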
147 changes: 94 additions & 53 deletions tests/pytorch/fused_attn/run_fused_attn_with_cp.py
@@ -2,15 +2,18 @@
#
# See LICENSE for license information.

import os, sys
import os, sys, logging
from contextlib import nullcontext
import torch
import torch.distributed as dist
from transformer_engine.pytorch.attention import DotProductAttention
from transformer_engine.pytorch.attention import get_cu_seqlens_on_cp_rank
import transformer_engine_torch as tex
from test_fused_attn_with_cp import model_configs_flash_attn, model_configs_fused_attn
from transformer_engine.pytorch.fp8 import fp8_autocast
from transformer_engine.common.recipe import DelayedScaling

dtypes = {"fp16": torch.float16, "bf16": torch.bfloat16}
dtypes = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp8": torch.bfloat16}


def run_dpa_with_cp(
@@ -57,6 +60,9 @@ def run_dpa_with_cp(
assert rank in cp_comm_ranks
cp_comm_group = dist.new_group(cp_comm_ranks, backend="nccl")

if dtype == "fp8":
fp8_recipe = DelayedScaling(fp8_dpa=True)

# instantiate core attn module
core_attn = DotProductAttention(
config.num_heads,
@@ -171,18 +177,27 @@ def run_dpa_with_cp(
# run core_attn without CP
for x in [q, k, v]:
x.requires_grad = True
out = core_attn(
q,
k,
v,
core_attention_bias_type=config.attn_bias_type,
core_attention_bias=bias,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_kv=cu_seqlens_kv,
cu_seqlens_q_padded=None if cu_seqlens_q_padded is None else cu_seqlens_q_padded[:-1],
cu_seqlens_kv_padded=None if cu_seqlens_kv_padded is None else cu_seqlens_kv_padded[:-1],
)
out.backward(dout)

if dtype == "fp8":
fp8_context = fp8_autocast(enabled=True, fp8_recipe=fp8_recipe, fp8_group=cp_comm_group)
else:
fp8_context = nullcontext()

with fp8_context:
out = core_attn(
q,
k,
v,
core_attention_bias_type=config.attn_bias_type,
core_attention_bias=bias,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_kv=cu_seqlens_kv,
cu_seqlens_q_padded=None if cu_seqlens_q_padded is None else cu_seqlens_q_padded[:-1],
cu_seqlens_kv_padded=(
None if cu_seqlens_kv_padded is None else cu_seqlens_kv_padded[:-1]
),
)
out.backward(dout)

# run core_attn with CP
q_, k_, v_, dout_, *rest = [
@@ -226,31 +241,34 @@ def run_dpa_with_cp(
core_attn.set_context_parallel_group(
cp_comm_group, cp_comm_ranks, torch.cuda.Stream(), cp_comm_type
)
out_ = core_attn(
q_,
k_,
v_,
core_attention_bias_type=config.attn_bias_type,
core_attention_bias=bias_,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_kv=cu_seqlens_kv,
cu_seqlens_q_padded=None if cu_seqlens_q_padded is None else cu_seqlens_q_padded[:-1],
cu_seqlens_kv_padded=None if cu_seqlens_kv_padded is None else cu_seqlens_kv_padded[:-1],
)
out_.backward(dout_)

if dtype == "fp8":
core_attn.reset_fp8_meta_tensors()
fp8_context = fp8_autocast(enabled=True, fp8_recipe=fp8_recipe, fp8_group=cp_comm_group)
else:
fp8_context = nullcontext()

with fp8_context:
out_ = core_attn(
q_,
k_,
v_,
core_attention_bias_type=config.attn_bias_type,
core_attention_bias=bias_,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_kv=cu_seqlens_kv,
cu_seqlens_q_padded=None if cu_seqlens_q_padded is None else cu_seqlens_q_padded[:-1],
cu_seqlens_kv_padded=(
None if cu_seqlens_kv_padded is None else cu_seqlens_kv_padded[:-1]
),
)
out_.backward(dout_)

for x in [out_, q_.grad, k_.grad, v_.grad]:
assert torch.all(~torch.isnan(x))
assert torch.all(~torch.isinf(x))

# compare results with and without CP
tols = dict(atol=5e-3, rtol=5e-3)
if dtype == "bf16":
if config.num_heads == config.num_gqa_groups:
tols = dict(atol=2.5e-2, rtol=2.5e-2)
else:
tols = dict(atol=3.5e-2, rtol=3.5e-2)

if qkv_format == "bshd" or qkv_format == "sbhd":
dq, dk, dv, out = [
x.view(
@@ -309,32 +327,55 @@ def run_dpa_with_cp(
else:
assert False, f"{qkv_format} is an unsupported qkv_format!"

if dtype == "bf16":
if config.num_heads == config.num_gqa_groups:
tols = dict(atol=2.5e-2, rtol=2.5e-2)
else:
tols = dict(atol=3.5e-2, rtol=3.5e-2)
elif dtype == "fp16":
tols = dict(atol=5e-3, rtol=5e-3)
elif dtype == "fp8":
tols = dict(atol=5e-1, rtol=5e-1)
rmse_tol = 0.1
else:
assert False, f"{dtype} is an unsupported dtype!"

def _rmse(a, b):
return torch.sqrt((a - b).square().mean()).item()

def _error(a, b):
if dtype != "fp8":
torch.testing.assert_close(a, b, **tols)
else:
try:
torch.testing.assert_close(a, b, **tols)
except Exception as e:
logging.debug(e)

rmse = _rmse(a, b)
rmse_range = max(a.max().item(), b.max().item()) - min(a.min().item(), b.min().item())
assert (
rmse < rmse_tol * rmse_range
), "RMSE {:.5f} is over tolerance {:.5f} ({:.5f} * {:.5f})".format(
rmse, rmse_tol * rmse_range, rmse_tol, rmse_range
)

if qkv_format == "bshd":
torch.testing.assert_close(out_[:, 0], out[:, 0], **tols)
torch.testing.assert_close(dq_[:, 0], dq[:, 0], **tols)
torch.testing.assert_close(dk_[:, 0], dk[:, 0], **tols)
torch.testing.assert_close(dv_[:, 0], dv[:, 0], **tols)
torch.testing.assert_close(out_[:, 1], out[:, 1], **tols)
torch.testing.assert_close(dq_[:, 1], dq[:, 1], **tols)
torch.testing.assert_close(dk_[:, 1], dk[:, 1], **tols)
torch.testing.assert_close(dv_[:, 1], dv[:, 1], **tols)
for a, b in zip([out_, dq_, dk_, dv_], [out, dq, dk, dv]):
_error(a[:, 0], b[:, 0])
_error(a[:, 1], b[:, 1])
elif qkv_format == "sbhd":
torch.testing.assert_close(out_[0], out[0], **tols)
torch.testing.assert_close(dq_[0], dq[0], **tols)
torch.testing.assert_close(dk_[0], dk[0], **tols)
torch.testing.assert_close(dv_[0], dv[0], **tols)
torch.testing.assert_close(out_[1], out[1], **tols)
torch.testing.assert_close(dq_[1], dq[1], **tols)
torch.testing.assert_close(dk_[1], dk[1], **tols)
torch.testing.assert_close(dv_[1], dv[1], **tols)
for a, b in zip([out_, dq_, dk_, dv_], [out, dq, dk, dv]):
_error(a[0], b[0])
_error(a[1], b[1])
elif qkv_format == "thd":
torch.testing.assert_close(out_, out, **tols)
torch.testing.assert_close(dq_, dq, **tols)
torch.testing.assert_close(dk_, dk, **tols)
torch.testing.assert_close(dv_, dv, **tols)
for a, b in zip([out_, dq_, dk_, dv_], [out, dq, dk, dv]):
_error(a, b)
else:
assert False, f"{qkv_format} is an unsupported qkv_format!"

dist.destroy_process_group()


def main(**kwargs):
run_dpa_with_cp(**kwargs)
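
A note on the comparison helpers added above: for bf16/fp16 the test keeps strict element-wise `torch.testing.assert_close`, but FP8 outputs are accepted via an RMSE bound relative to the tensors' combined dynamic range (`rmse_tol = 0.1`). A standalone sketch of that criterion, with a helper name chosen here for illustration:

```python
# Relative-RMSE acceptance check mirroring the FP8 branch of _error() above.
# The helper name and default tolerance are illustrative; the test hard-codes
# rmse_tol = 0.1 for FP8.
import torch

def rmse_close(a: torch.Tensor, b: torch.Tensor, rmse_tol: float = 0.1) -> bool:
    rmse = torch.sqrt((a - b).square().mean()).item()
    # Normalize by the combined dynamic range of both tensors so the bound
    # scales with the magnitude of the activations/gradients being compared.
    rng = max(a.max().item(), b.max().item()) - min(a.min().item(), b.min().item())
    return rmse < rmse_tol * rng
```
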
12 changes: 10 additions & 2 deletions tests/pytorch/fused_attn/test_fused_attn_with_cp.py
@@ -90,7 +90,7 @@ def test_cp_with_flash_attention(dtype, model, qkv_format, cp_comm_type):

@pytest.mark.skipif(get_cudnn_version() < (8, 9, 7), reason="cuDNN 8.9.7+ is required.")
@pytest.mark.skipif(get_device_compute_capability() < (8, 0), reason="CP tests require sm80+.")
@pytest.mark.parametrize("dtype", ["bf16", "fp16"])
@pytest.mark.parametrize("dtype", ["bf16", "fp16", "fp8"])
@pytest.mark.parametrize("model", model_configs_fused_attn.keys())
@pytest.mark.parametrize("qkv_format", ["bshd", "sbhd", "thd"])
@pytest.mark.parametrize("cp_comm_type", ["p2p", "all_gather"])
@@ -121,8 +121,16 @@ def test_cp_with_fused_attention(dtype, model, qkv_format, cp_comm_type):
)
if config.window_size != (-1, 0) and config.window_size != (-1, -1):
pytest.skip(
f"Fused attention does not support sliding window attention + context parallelism yet!"
"Fused attention does not support sliding window attention + context parallelism yet!"
)
if cp_comm_type == "all_gather" and dtype == "fp8":
pytest.skip(
"CP implementation with KV all-gather does not support FP8 + context parallelism yet!"
)
if dtype == "fp8" and qkv_format == "thd":
pytest.skip("FP8 attention cannot work with THD format yet!")
if dtype == "fp8" and config.attn_bias_type != "no_bias":
pytest.skip("FP8 attention cannot work with bias yet!")

subprocess.run(
get_bash_arguments(
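
With the new "fp8" dtype parameter in place, the FP8 cases can be selected on their own; a possible invocation (assuming a multi-GPU node and a Transformer Engine checkout) is sketched below.

```python
# Run only the FP8-parametrized fused-attention CP tests; the -k filter matches
# the "fp8" dtype id added by this commit.
import pytest

exit_code = pytest.main(
    ["-v", "-k", "fp8", "tests/pytorch/fused_attn/test_fused_attn_with_cp.py"]
)
```
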