feat: refactor rms_norm for ascend speed #69

Merged · 8 commits · Apr 2, 2024
7 changes: 2 additions & 5 deletions csrc/extensions.cpp
@@ -39,17 +39,16 @@ void extAdamW(at::Tensor& param, at::Tensor& exp_avg, at::Tensor& exp_avg_sq,
                beta1, beta2, epsilon, weight_decay, step, amsgrad);
}

-auto extRmsNorm(at::Tensor& output, at::Tensor& inv_rms,
+void extRmsNorm(at::Tensor& output, at::Tensor& inv_rms,
                const at::Tensor& input,
                const OptionalIntArray& normalized_shape,
                const at::Tensor& weight, const at::Tensor& bias, double eps) {
  at::OptionalIntArrayRef normalized_shape_at = *normalized_shape;
  callDiopi(diopiRMSNorm, output, inv_rms, input, normalized_shape_at, weight,
            bias, eps);
-  return std::make_tuple(std::move(output), std::move(inv_rms));
}

-auto extRmsNormBackward(at::Tensor& grad_input, at::Tensor& grad_weight,
+void extRmsNormBackward(at::Tensor& grad_input, at::Tensor& grad_weight,
                        at::Tensor& grad_bias, const at::Tensor& grad_output,
                        const at::Tensor& input, const at::Tensor& weight,
                        const at::Tensor& bias, const at::Tensor& inv_rms,
@@ -58,8 +57,6 @@ auto extRmsNormBackward(at::Tensor& grad_input, at::Tensor& grad_weight,
  callDiopi(diopiRMSNormBackward, grad_input, grad_weight, grad_bias,
            grad_output, input, weight, bias, inv_rms, normalized_shape_at,
            eps);
-  return std::make_tuple(std::move(grad_input), std::move(grad_weight),
-                         std::move(grad_bias));
}

void extApplyRotary(at::Tensor& output, const at::Tensor& input,
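The change in csrc/extensions.cpp is that extRmsNorm and extRmsNormBackward no longer build and return a std::tuple of their outputs; they now return void and write results into the tensors the caller pre-allocates. A minimal sketch of the resulting calling convention from Python follows; the shapes, dtypes, device string, and eps value are illustrative assumptions, not taken from the PR.

import torch
import deeplink_ext.cpp_extensions as ext

hidden = torch.randn(2, 128, 4096, dtype=torch.float16, device="cuda")
weight = torch.ones(4096, dtype=torch.float16, device="cuda")
bias = torch.Tensor().cuda()  # empty bias, mirroring rms_norm.py below

# The caller owns the output buffers; the binding fills them in place.
output = torch.empty_like(hidden)
inv_rms = torch.empty(
    list(hidden.shape[:-1]) + [1], dtype=torch.float32, device=hidden.device
)

# The C++ function used to return std::make_tuple(output, inv_rms); after this
# PR it returns void, so results are read from the pre-allocated tensors.
ext.rms_norm(output, inv_rms, hidden, weight.shape, weight, bias, 1e-6)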
11 changes: 10 additions & 1 deletion deeplink_ext/ascend_speed/__init__.py
@@ -1,5 +1,14 @@
from .rotary_embedding import apply_rotary, RotaryEmbedding
from .adamw import adamw
from .scaled_masked_softmax import ScaledMaskedSoftmax
+from .rms_norm import RMSNorm
+from .flash_attention import FlashSelfAttention

-__all__ = ["apply_rotary", "RotaryEmbedding", "adamw", "ScaledMaskedSoftmax"]
+__all__ = [
+    "apply_rotary",
+    "RotaryEmbedding",
+    "adamw",
+    "ScaledMaskedSoftmax",
+    "RMSNorm",
+    "FlashSelfAttention",
+]
47 changes: 47 additions & 0 deletions deeplink_ext/ascend_speed/rms_norm.py
@@ -0,0 +1,47 @@
import torch
import deeplink_ext.cpp_extensions as ext


assert hasattr(ext, "rms_norm") and hasattr(ext, "rms_norm_backward")


class RMSNorm(torch.autograd.Function):
@staticmethod
def forward(ctx, hidden_states, weight, eps):
bias = torch.Tensor().cuda()
output = torch.empty_like(hidden_states)
input_dtype = hidden_states.dtype
acc_dtype = (
torch.float32
if input_dtype in [torch.bfloat16, torch.float16]
else input_dtype
)
inv_rms = torch.empty(
list(hidden_states.shape[:-1]) + [1],
dtype=acc_dtype,
device=hidden_states.device,
)
ext.rms_norm(output, inv_rms, hidden_states, weight.shape, weight, bias, eps)
ctx.save_for_backward(hidden_states, inv_rms, weight, bias)
ctx.eps = eps
return output

@staticmethod
def backward(ctx, grad_output):
hidden_states, inv_rms, weight, bias = ctx.saved_tensors
grad_input = torch.empty_like(hidden_states)
grad_weight = torch.empty_like(weight)
grad_bias = torch.empty_like(bias)
ext.rms_norm_backward(
grad_input,
grad_weight,
grad_bias,
grad_output,
hidden_states,
weight,
bias,
inv_rms,
weight.shape,
ctx.eps,
)
return grad_input, grad_weight, None, None
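For reference, a minimal usage sketch of the new ascend_speed RMSNorm autograd function; the tensor shapes, dtypes, device string, and eps value here are illustrative assumptions, not part of the PR.

import torch
from deeplink_ext.ascend_speed import RMSNorm

# Illustrative (batch, seq_len, hidden_size) activation and per-channel weight.
hidden_states = torch.randn(
    2, 128, 4096, dtype=torch.float16, device="cuda", requires_grad=True
)
weight = torch.ones(4096, dtype=torch.float16, device="cuda", requires_grad=True)

output = RMSNorm.apply(hidden_states, weight, 1e-6)

# Backward routes through ext.rms_norm_backward; the trailing Nones returned by
# backward mean eps receives no gradient.
output.sum().backward()
print(hidden_states.grad.shape, weight.grad.shape)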
2 changes: 0 additions & 2 deletions deeplink_ext/internlm_ops/__init__.py
@@ -1,7 +1,5 @@
# Copyright (c) 2024, DeepLink.

from . import mha


_not_impl = "[deeplink_ext] {op_name} is not implemented in diopi. Falling back to the slower torch implementation."
