
Commit 9053dd1

[feat]: improve overlap performance

1 parent 43f5388

File tree: 3 files changed, +122 −51 lines changed

vllm_ascend/attention/mla_v1.py

Lines changed: 41 additions & 30 deletions
@@ -575,7 +575,17 @@ def _forward_prefill(
         )
         attn_output = attn_output.reshape(
             [num_tokens, self.num_heads * self.v_head_dim])
-        return self.o_proj(attn_output)[0]
+
+        # A better way would be to modify the communication ops or the RowParallel layer in vllm
+        from vllm_ascend.multistream.context import get_multistream_comm_context
+        current_ms_metadata = get_multistream_comm_context()
+        if current_ms_metadata is None:
+            return self.o_proj(attn_output)[0]
+        else:
+            current_ms_metadata.before_comm_event.record()
+            with torch.npu.stream(current_ms_metadata.comm_stream):
+                current_ms_metadata.before_comm_event.wait()
+                return self.o_proj(attn_output)[0]
 
     def exec_kv(
         self,
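The handshake above is the pattern this commit applies throughout: record an event at the end of the compute work, switch to the dedicated communication stream, wait on that event there, and only then run the op that triggers the TP collective (o_proj is a row-parallel layer, so it ends in an all-reduce). A minimal sketch of the same pattern, assuming an Ascend PyTorch build where torch.npu exposes streams and events; run_on_comm_stream and metadata are illustrative names, not part of the commit:

import torch

def run_on_comm_stream(metadata, fn, *args):
    # No dbo context active: run inline on the current stream.
    if metadata is None:
        return fn(*args)
    # Mark the end of the compute-stream work ...
    metadata.before_comm_event.record()
    with torch.npu.stream(metadata.comm_stream):
        # ... and make the comm stream wait for it before the collective.
        metadata.before_comm_event.wait()
        return fn(*args)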
@@ -675,7 +685,16 @@ def _forward_decode(
             context_lens=attn_metadata.decode.seq_lens, # type:ignore
             mla_vheadsize=self.kv_lora_rank,
             out=attn_output)
-        return self._v_up_proj_and_o_proj(attn_output)
+        from vllm_ascend.multistream.context import get_multistream_comm_context
+        current_ms_metadata = get_multistream_comm_context()
+        if current_ms_metadata is None:
+            return self._v_up_proj_and_o_proj(attn_output)
+        else:
+            current_ms_metadata.before_comm_event.record()
+            with torch.npu.stream(current_ms_metadata.comm_stream):
+                current_ms_metadata.before_comm_event.wait()
+                return self._v_up_proj_and_o_proj(attn_output)
+
 
     def forward(
         self,
@@ -800,24 +819,20 @@ def forward(
                 key_cache=kv_cache,
                 slot_indices=attn_metadata.slot_mapping.flatten())
         if has_prefill:
-            # FIX: aicore move/copy should be also placed on the comm stream in dbo,
-            # otherwise it may affect the accuracy or disturb the overlap of next stage
-            # TODO: use an elegant way here to avoid it
+            # FIX: the aicore move should also be placed on the comm stream in dbo,
+            # otherwise it may affect the accuracy
+            # TODO: use an elegant way to overlap
            from vllm_ascend.multistream.context import get_multistream_comm_context
+            output_prefill = self._forward_prefill(
+                prefill_q, prefill_k_c_normed, prefill_k_pe, kv_cache,
+                attn_metadata)
             current_ms_metadata = get_multistream_comm_context()
-            if current_ms_metadata is None:
-                output[num_decode_tokens:] = self._forward_prefill(
-                    prefill_q, prefill_k_c_normed, prefill_k_pe, kv_cache,
-                    attn_metadata)
-            else:
-                current_ms_metadata.before_comm_event.record()
+            if current_ms_metadata is not None:
                 with torch.npu.stream(current_ms_metadata.comm_stream):
-                    current_ms_metadata.before_comm_event.wait()
-                    output[num_decode_tokens:] = self._forward_prefill(
-                        prefill_q, prefill_k_c_normed, prefill_k_pe, kv_cache,
-                        attn_metadata)
+                    output[num_decode_tokens:] = output_prefill
                     current_ms_metadata.after_comm_event.record()
-
+            else:
+                output[num_decode_tokens:] = output_prefill
 
 
         if has_decode:
@@ -826,21 +841,17 @@ def forward(
                                                   decode_k_nope, decode_k_pe,
                                                   kv_cache, attn_metadata)
             else:
-
                 from vllm_ascend.multistream.context import get_multistream_comm_context
-                current_ms_metadata = get_multistream_comm_context()
-                if current_ms_metadata is None:
-                    output[:num_decode_tokens] = self._forward_decode(
-                        decode_ql_nope, decode_q_pe, decode_k_nope, decode_k_pe,
-                        kv_cache, attn_metadata)
-                else:
-                    current_ms_metadata.before_comm_event.record()
-                    with torch.npu.stream(current_ms_metadata.comm_stream):
-                        current_ms_metadata.before_comm_event.wait()
-                        output[:num_decode_tokens] = self._forward_decode(
-                            decode_ql_nope, decode_q_pe, decode_k_nope, decode_k_pe,
-                            kv_cache, attn_metadata)
-                        current_ms_metadata.after_comm_event.record()
+                output_decode = self._forward_decode(
+                    decode_ql_nope, decode_q_pe, decode_k_nope, decode_k_pe,
+                    kv_cache, attn_metadata)
+                current_ms_metadata = get_multistream_comm_context()
+                if current_ms_metadata is not None:
+                    with torch.npu.stream(current_ms_metadata.comm_stream):
+                        output[:num_decode_tokens] = output_decode
+                        current_ms_metadata.after_comm_event.record()
+                else:
+                    output[:num_decode_tokens] = output_decode
 
 
         return output_padded
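Net effect of the two forward() hunks: _forward_prefill and _forward_decode are now always launched from the default stream (each handles its own comm-stream handoff internally, per the earlier hunks), and only the copy of the result into the shared output buffer is routed through the comm stream, since per the FIX comment the aicore copy must follow the comm stream under dbo. A condensed sketch of the new shape, where compute() and start:end stand in for the real calls and slice bounds:

output_chunk = compute()  # attention + projection, default stream
current_ms_metadata = get_multistream_comm_context()
if current_ms_metadata is not None:
    with torch.npu.stream(current_ms_metadata.comm_stream):
        output[start:end] = output_chunk  # copy rides the comm stream
        current_ms_metadata.after_comm_event.record()
else:
    output[start:end] = output_chunk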

vllm_ascend/models/deepseek_v2.py

Lines changed: 81 additions & 19 deletions
@@ -149,6 +149,50 @@ def forward(self, x):
         x, _ = self.down_proj(x)
         return x
 
+    def _forward_ms_mlp(self, x):
+        current_ms_metadata = get_multistream_comm_context()
+        assert current_ms_metadata is not None
+        if self.is_dynamic_quant:
+            x, dynamic_scale = torch_npu.npu_dynamic_quant(x)
+            x = torch_npu.npu_quant_matmul(
+                x,
+                self.gate_up_proj.weight,
+                self.gate_up_proj.weight_scale,
+                output_dtype=torch.int32,
+            )
+            x, dynamic_scale = torch_npu.npu_dequant_swiglu_quant(
+                x=x,
+                weight_scale=self.gate_up_proj.weight_scale_fp32,
+                activation_scale=dynamic_scale,
+                bias=None,
+                quant_scale=None,
+                quant_offset=None,
+                group_index=None,
+                activate_left=True,
+                quant_mode=1)
+            x = torch_npu.npu_quant_matmul(
+                x,
+                self.down_proj.weight,
+                self.down_proj.weight_scale,
+                pertoken_scale=dynamic_scale,
+                output_dtype=torch.bfloat16,
+            )
+            if self.down_proj.reduce_results and self.down_proj.tp_size > 1:
+                current_ms_metadata.before_comm_event.record()
+                with torch.npu.stream(current_ms_metadata.comm_stream):
+                    current_ms_metadata.before_comm_event.wait()
+                    x = tensor_model_parallel_all_reduce(x)
+                    current_ms_metadata.after_comm_event.record()
+            return x
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        current_ms_metadata.before_comm_event.record()
+        with torch.npu.stream(current_ms_metadata.comm_stream):
+            current_ms_metadata.before_comm_event.wait()
+            x, _ = self.down_proj(x)
+            current_ms_metadata.after_comm_event.record()
+        return x
+
 
 class CustomDeepseekV2MoE(nn.Module):
 
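_forward_ms_mlp mirrors CustomDeepseekV2MLP.forward but pushes the reducing op (down_proj on the unquantized path, the explicit tensor_model_parallel_all_reduce on the dynamic-quant path) onto the comm stream. The assert means a multistream context must already be active when it runs; a hedged usage sketch, where mlp, step_metadata, and micro_batch_idx are illustrative names:

from vllm_ascend.multistream.context import set_multistream_context

# Inside the context, get_multistream_comm_context() returns the step
# metadata, so the assert in _forward_ms_mlp holds.
with set_multistream_context(step_metadata, micro_batch_idx):
    shared_output = mlp._forward_ms_mlp(hidden_states)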
@@ -282,7 +326,7 @@ def _forward_ms_op_shared_expert(
         self,
         hidden_states: torch.Tensor,
     ):
-        shared_output = self.shared_experts(hidden_states)
+        shared_output = self.shared_experts._forward_ms_mlp(hidden_states)
         return shared_output
 
     def _forward_ms_op_gate(
@@ -293,7 +337,7 @@ def _forward_ms_op_gate(
         router_logits, _ = self.gate(hidden_states)
         return router_logits
 
-    def _forward_ms_op_tp_allreduce(
+    def _forward_ms_op_tp_allgather(
         self,
         hidden_states: torch.Tensor,
         shared_output: torch.Tensor,
@@ -303,13 +347,26 @@ def _forward_ms_op_tp_allreduce(
     ):
 
         if self.tp_size > 1:
-            dist.all_gather(list(chunk_hidden_states), hidden_states,
-                            self.tp_group)
-            final_hidden_states = torch.cat(chunk_hidden_states, dim=0)
-            #if num_tokens < self.tp_size:
-            #    final_hidden_states = final_hidden_states[:num_tokens]
-            if num_tokens > 0:
-                final_hidden_states = final_hidden_states[:-num_tokens]
+            current_ms_metadata = get_multistream_comm_context()
+            if current_ms_metadata is None:
+                dist.all_gather(list(chunk_hidden_states), hidden_states,
+                                self.tp_group)
+                final_hidden_states = torch.cat(chunk_hidden_states, dim=0)
+                #if num_tokens < self.tp_size:
+                #    final_hidden_states = final_hidden_states[:num_tokens]
+                if num_tokens > 0:
+                    final_hidden_states = final_hidden_states[:-num_tokens]
+            else:
+                current_ms_metadata.before_comm_event.record()
+                with torch.npu.stream(current_ms_metadata.comm_stream):
+                    dist.all_gather(list(chunk_hidden_states), hidden_states,
+                                    self.tp_group)
+                    final_hidden_states = torch.cat(chunk_hidden_states, dim=0)
+                    #if num_tokens < self.tp_size:
+                    #    final_hidden_states = final_hidden_states[:num_tokens]
+                    if num_tokens > 0:
+                        final_hidden_states = final_hidden_states[:-num_tokens]
+
         else:
             final_hidden_states = hidden_states
 
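The rename from _forward_ms_op_tp_allreduce to _forward_ms_op_tp_allgather matches what the body actually does: a dist.all_gather across TP ranks followed by a cat, with the [:-num_tokens] slice trimming trailing rows (num_tokens appears to carry the pad length here, judging by that slice). A CPU-only toy illustration of the gather-and-trim step, with made-up shapes:

import torch

tp_size, pad_rows = 2, 1  # hypothetical values
# What dist.all_gather(list(chunks), x, group) followed by torch.cat
# produces on each rank:
chunks = [torch.randn(4, 8) for _ in range(tp_size)]  # padded shards
gathered = torch.cat(chunks, dim=0)                   # shape (8, 8)
if pad_rows > 0:
    gathered = gathered[:-pad_rows]                   # drop trailing padding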
@@ -650,25 +707,24 @@ def _forward_ms_layer(
 
             # input layernorm
             hidden_states[i], residual[i] = self._forward_ms_op_input_layernorm(hidden_states[i], residual[i])
-            # attention and tp allreducea
+            # attention and tp allreduce
             hidden_states[i], residual[i] = self._forward_ms_op_attn(positions[i], hidden_states[i], residual[i], kv_cache, attn_metadata[i])
 
         ''' block 3 : shared experts
         if there is an allreduce ops in shared expert, we can overlap it with the computation of the
         shared expert for next microbatch or moe gating
         '''
         for i in range(num_micro_batchs):
+            ms_metadata.try_wait_event(layer_index, i, MSEventKey.ATTN_AR_FINISH)
             context = MultiStreamStepMetadata(
                 comm_stream=ms_metadata.communicate_stream,
                 before_comm_event=ms_metadata.ms_events[layer_index][i][MSEventKey.MOE_SE_COMP_FINISH],
                 after_comm_event=ms_metadata.ms_events[layer_index][i][MSEventKey.MOE_SE_COMM_FINISH],
             )
             with set_multistream_context(context, i):
                 # compute shared expert after finishing ATTN AR
-                ms_metadata.try_wait_event(layer_index, i, MSEventKey.ATTN_AR_FINISH)
                 hidden_states[i], residual[i] = self._forward_ms_op_post_attn_layernorm(hidden_states[i], residual[i])
 
-
             num_token, hidden_dim = hidden_states[i].shape
             hidden_states[i] = hidden_states[i].view(-1, hidden_dim)
             #num_tokens.append(num_token)
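Hoisting the try_wait_event(..., ATTN_AR_FINISH) call out of set_multistream_context makes each microbatch wait for its attention all-reduce before any of the shared-expert step is staged, rather than from inside the context. Schematically (an illustrative condensation of the loop above, not new code):

for i in range(num_micro_batchs):
    # Wait for this microbatch's attention all-reduce first ...
    ms_metadata.try_wait_event(layer_index, i, MSEventKey.ATTN_AR_FINISH)
    # ... then stage the shared-expert compute/comm under the context.
    with set_multistream_context(context, i):
        hidden_states[i], residual[i] = self._forward_ms_op_post_attn_layernorm(
            hidden_states[i], residual[i])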
@@ -740,10 +796,14 @@ def _forward_ms_layer(
                 before_comm_event=ms_metadata.ms_events[layer_index][i][MSEventKey.FFN_COM_FINISH],
                 after_comm_event=ms_metadata.ms_events[layer_index][i][MSEventKey.MOE_AFTER_COMM],
             )
-            with set_multistream_context(context, i):
+            context.before_comm_event.record()
+            with torch.npu.stream(ms_metadata.communicate_stream):
+                #with set_multistream_context(context, i):
+                context.before_comm_event.wait()
                 if self.mlp.experts.reduce_results and (self.mlp.experts.tp_size > 1 or self.mlp.experts.ep_size > 1):
                     hidden_states[i] = tensor_model_parallel_all_reduce(
                         hidden_states[i])
+                context.after_comm_event.record()
             # check here
             hidden_states[i] = hidden_states[i] * self.mlp.routed_scaling_factor
             context = MultiStreamStepMetadata(
@@ -752,7 +812,7 @@ def _forward_ms_layer(
                 after_comm_event=ms_metadata.ms_events[layer_index][i][MSEventKey.FFN_AR_FINISH],
             )
             with set_multistream_context(context, i):
-                hidden_states[i] = self.mlp._forward_ms_op_tp_allreduce(hidden_states[i], shared_outputs[i], chunk_hidden_states[i], num_tokens[i], hidden_dims[i])
+                hidden_states[i] = self.mlp._forward_ms_op_tp_allgather(hidden_states[i], shared_outputs[i], chunk_hidden_states[i], num_tokens[i], hidden_dims[i])
             with torch.npu.stream(ms_metadata.communicate_stream):
                 # last
                 if isinstance(
@@ -764,6 +824,7 @@ def _forward_ms_layer(
                     # The scaling of DeepseekV2MOE output would be done in the forward
                     # of DeepseekV2MOE
                     hidden_states[i] *= 1. / self.routed_scaling_factor
+                context.after_comm_event.record()
         return hidden_states, residual
     # should split ops in Decoder Layer
     def _forward_ms_op_input_layernorm(
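In the MoE all-reduce hunk above, the set_multistream_context wrapper is replaced by the inlined event protocol, presumably so that after_comm_event is recorded immediately after the collective rather than at context exit. A simplified restatement of the inlined pattern (the guard on reduce_results/tp_size/ep_size is elided):

context.before_comm_event.record()                  # end of FFN compute
with torch.npu.stream(ms_metadata.communicate_stream):
    context.before_comm_event.wait()
    hidden_states[i] = tensor_model_parallel_all_reduce(hidden_states[i])
    context.after_comm_event.record()               # right after the collective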
@@ -934,7 +995,7 @@ def forward(
     def can_run_ms(self):
         # currently we only enable prefill overlap
         attn_metadata = get_forward_context().attn_metadata
-        dp_metadata = get_forward_context().dp_metadata
+        # dp_metadata = get_forward_context().dp_metadata
         # profile run
         if self.multistream_config is None or attn_metadata is None:
             return False
@@ -944,16 +1005,17 @@ def can_run_ms(self):
         # disable decode dbo
         if attn_metadata.num_prefills == 0:
             return False
-        num_microbatchs = self.multistream_config.num_micro_batches
         # check whether there is a dp rank that not use dual batch
-        '''if dp_metadata is not None:
+        '''
+        num_microbatchs = self.multistream_config.num_micro_batches
+        if dp_metadata is not None:
             for i in range(num_microbatchs):
                 cu_tokens = dp_metadata.cu_dbo_tokens_across_dp_cpu[i]
                 if torch.any(cu_tokens == 0).item():
                     return False
+        '''
         [token_index, seq_index] = compute_split_seq_index(attn_metadata.query_lens,
-            attn_metadata.attn_state, attn_metadata.num_decode_tokens)
-        '''
+                attn_metadata.attn_state, attn_metadata.num_decode_tokens)
         if token_index == 0 or seq_index == 0 or seq_index == len(attn_metadata.query_lens):
             return False
         # check whether the total tokens exceed the threshold
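After this hunk, can_run_ms() gates dual-batch overlap on three conditions: a multistream config is present, the batch contains at least one prefill, and compute_split_seq_index finds a usable split (the per-DP-rank dual-batch check is commented out along with its dp_metadata lookup). A condensed, hedged sketch of the resulting logic; the trailing total-token threshold check is elided:

def can_run_ms_sketch(attn_metadata, multistream_config):
    # Profile run, or multistream disabled.
    if multistream_config is None or attn_metadata is None:
        return False
    # Decode-only dbo is disabled.
    if attn_metadata.num_prefills == 0:
        return False
    token_index, seq_index = compute_split_seq_index(
        attn_metadata.query_lens, attn_metadata.attn_state,
        attn_metadata.num_decode_tokens)
    return not (token_index == 0 or seq_index == 0
                or seq_index == len(attn_metadata.query_lens))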

vllm_ascend/ops/fused_moe.py

Lines changed: 0 additions & 2 deletions
@@ -37,8 +37,6 @@
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
 
-from vllm_ascend.multistream.base import MSEventKey
-from vllm_ascend.multistream.metadata import MultiStreamStepMetadata, MultiStreamMetadata
 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.distributed.parallel_state import get_ep_group, get_etp_group
 