Optimized the idle interval between CUDA kernel launches caused by view and memcopy operations (#5390)

* opt_view_and_memcopy
* fix bugs in ci
* fix ci bugs
* update benchmark scripts
* fix ci bugs
2025-09-06 11:32:10 +00:00 · 2024-02-21 13:23:57 +08:00
parent 730103819d
commit 2a718c8be8
8 changed files with 141 additions and 55 deletions
--- a/colossalai/inference/modeling/models/nopadding_llama.py
+++ b/colossalai/inference/modeling/models/nopadding_llama.py
@@ -2,7 +2,6 @@
 from typing import List, Optional, Tuple

 import torch
-from torch.nn import Parameter
 from transformers.models.llama.modeling_llama import (
    LlamaAttention,
    LlamaConfig,
@@ -82,19 +81,21 @@ def llama_model_forward(

    if batch.is_prompts:
        output_tensor = torch.zeros(
-            (sequence_lengths.sum().item(), batch.num_heads, batch.head_dim), dtype=batch.dtype, device=batch.device
+            (sequence_lengths.sum().item(), batch.num_heads * batch.head_dim), dtype=batch.dtype, device=batch.device
        )
    else:
        output_tensor = torch.zeros(
-            (batch_size, batch.num_heads, batch.head_dim), dtype=batch.dtype, device=batch.device
+            (batch_size, batch.num_heads * batch.head_dim), dtype=batch.dtype, device=batch.device
        )
    sm_scale = 1.0 / (batch.head_dim**0.5)

    norm_output = torch.empty_like(hidden_states)
+    residual = None

    for layer_id, decoder_layer in enumerate(self.layers):
-        hidden_states = decoder_layer(
+        hidden_states, residual = decoder_layer(
            hidden_states,
+            residual=residual,
            block_tables=block_tables,
            k_cache=k_caches[layer_id],
            v_cache=v_caches[layer_id],
@@ -111,8 +112,9 @@ def llama_model_forward(
    if batch.is_prompts:
        last_token_indexs = sequence_lengths.cumsum(dim=-1)
        hidden_states = hidden_states[last_token_indexs - 1].contiguous()
+        residual = residual[last_token_indexs - 1].contiguous()
        norm_output = torch.empty_like(hidden_states)
-    hidden_states = self.norm(hidden_states, norm_output)
+    hidden_states, _ = self.norm(hidden_states, norm_output, residual)

    return hidden_states

@@ -120,6 +122,7 @@ def llama_model_forward(
 def llama_decoder_layer_forward(
    self: LlamaDecoderLayer,
    hidden_states: torch.Tensor,
+    residual: torch.Tensor,
    block_tables: torch.Tensor = None,
    k_cache: torch.Tensor = None,
    v_cache: torch.Tensor = None,
@@ -136,6 +139,7 @@ def llama_decoder_layer_forward(

    Args:
        hidden_states (torch.Tensor): input to the layer of shape [token_num, embed_dim].
+        residual (torch.Tensor): shape [token_num, embed_dim], used to be added to hidden_states in out_proj.
        block_tables (torch.Tensor, optional): A 2D tensor of shape [batch_size, max_blocks_per_sequence],
            storing mapping of token_position_id -> block_id. Defaults to None.
        k_cache (torch.Tensor, optional): It holds the GPU memory for the key cache. Defaults to None.
@@ -151,12 +155,10 @@ def llama_decoder_layer_forward(
        sm_scale (int, optional): Used for flash attention. Defaults to None.
    """

-    residual = hidden_states
-    hidden_states = self.input_layernorm(hidden_states, norm_output)
+    hidden_states, residual = self.input_layernorm(hidden_states, norm_output, residual)
    # Self Attention
    hidden_states = self.self_attn(
        hidden_states=hidden_states,
-        residual=residual,
        block_tables=block_tables,
        k_cache=k_cache,
        v_cache=v_cache,
@@ -170,11 +172,10 @@ def llama_decoder_layer_forward(
    )

    # Fully Connected
-    residual = hidden_states
-    hidden_states = self.post_attention_layernorm(hidden_states, norm_output)
-    hidden_states = self.mlp(hidden_states, residual)
+    hidden_states, residual = self.post_attention_layernorm(hidden_states, norm_output, residual)
+    hidden_states = self.mlp(hidden_states)

-    return hidden_states
+    return hidden_states, residual


 class NopadLlamaAttention(LlamaAttention):
@@ -198,16 +199,18 @@ class NopadLlamaAttention(LlamaAttention):
            attn_oproj_w (torch.Tensor, optional): The transposed o_proj weight. Defaults to None.
        """
        super().__init__(config, layer_idx)
-        self.q_proj.weight = Parameter(attn_qproj_w, requires_grad=False)
-        self.k_proj.weight = Parameter(attn_kproj_w, requires_grad=False)
-        self.v_proj.weight = Parameter(attn_vproj_w, requires_grad=False)
-        self.o_proj.weight = Parameter(attn_oproj_w, requires_grad=False)
+        self.q_proj_weight = attn_qproj_w
+        self.k_proj_weight = attn_kproj_w
+        self.v_proj_weight = attn_vproj_w
+        self.o_proj_weight = attn_oproj_w
+
        if self.num_heads == self.num_key_value_heads:
-            qkv_weight_list = [self.q_proj.weight, self.k_proj.weight, self.v_proj.weight]
+            qkv_weight_list = [self.q_proj_weight, self.k_proj_weight, self.v_proj_weight]
            self.qkv_weight = torch.stack(qkv_weight_list, dim=0)
-            self.q_proj = None
-            self.k_proj = None
-            self.v_proj = None
+
+        self.q_proj = None
+        self.k_proj = None
+        self.v_proj = None

    @staticmethod
    def from_native_module(module: LlamaAttention, *args, **kwargs) -> LlamaAttention:
@@ -239,7 +242,6 @@ class NopadLlamaAttention(LlamaAttention):
    def forward(
        self,
        hidden_states: torch.Tensor,
-        residual: torch.Tensor,
        block_tables: torch.Tensor = None,
        k_cache: torch.Tensor = None,
        v_cache: torch.Tensor = None,
@@ -254,7 +256,6 @@ class NopadLlamaAttention(LlamaAttention):
        """
        Args:
            hidden_states (torch.Tensor): input to the layer of shape [token_num, embed_dim].
-            residual (torch.Tensor): shape [token_num, embed_dim], used to be added to hidden_states in out_proj.
            block_tables (torch.Tensor, optional): A 2D tensor of shape [batch_size, max_blocks_per_sequence],
                storing mapping of token_position_id -> block_id. Defaults to None.
            k_cache (torch.Tensor, optional): It holds the GPU memory for the key cache. Defaults to None.
@@ -270,9 +271,9 @@ class NopadLlamaAttention(LlamaAttention):
        """

        if self.num_heads != self.num_key_value_heads:
-            query_states = torch.mm(hidden_states, self.q_proj.weight).view(-1, self.num_heads, self.head_dim)
-            key_states = torch.mm(hidden_states, self.k_proj.weight).view(-1, self.num_key_value_heads, self.head_dim)
-            value_states = torch.mm(hidden_states, self.v_proj.weight).view(-1, self.num_key_value_heads, self.head_dim)
+            query_states = torch.mm(hidden_states, self.q_proj_weight).view(-1, self.num_heads, self.head_dim)
+            key_states = torch.mm(hidden_states, self.k_proj_weight).view(-1, self.num_key_value_heads, self.head_dim)
+            value_states = torch.mm(hidden_states, self.v_proj_weight).view(-1, self.num_key_value_heads, self.head_dim)
        else:
            # fused qkv
            token_nums = hidden_states.size(0)
@@ -324,8 +325,7 @@ class NopadLlamaAttention(LlamaAttention):
                sm_scale=sm_scale,
            )

-        attn_output = attn_output.view(-1, self.hidden_size)
-        attn_output = torch.addmm(residual, attn_output, self.o_proj.weight)
+        attn_output = torch.mm(attn_output, self.o_proj_weight)

        return attn_output

@@ -348,10 +348,11 @@ class NopadLlamaMLP(LlamaMLP):
            mlp_dproj_w (torch.Tensor, optional): The transposed down_proj weight. Defaults to None.
        """
        super().__init__(config)
-        self.gate_up_weight = Parameter(torch.stack([mlp_gproj_w, mlp_uproj_w], dim=0), requires_grad=False)
-        self.down_proj.weight = Parameter(mlp_dproj_w, requires_grad=False)
+        self.gate_up_weight = torch.stack([mlp_gproj_w, mlp_uproj_w], dim=0)
+        self.down_proj_weight = mlp_dproj_w
        self.gate_proj = None
        self.up_proj = None
+        self.down_proj = None

    @staticmethod
    def from_native_module(module: LlamaMLP, *args, **kwargs) -> LlamaMLP:
@@ -375,14 +376,13 @@ class NopadLlamaMLP(LlamaMLP):

        return mlp_layer

-    def forward(self, hidden_states: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """
        Args:
            hidden_states (torch.Tensor): input to the layer of shape [token_num, embed_dim].
-            residual (torch.Tensor): shape [token_num, embed_dim], used to be added to hidden_states in down_proj.
        """
        hidden_states = hidden_states.expand(2, -1, -1)
        gate_up_proj_out = torch.bmm(hidden_states, self.gate_up_weight)
        act_out = torch.nn.functional.silu(gate_up_proj_out[0], inplace=True)
        tmp_out = act_out * gate_up_proj_out[1]
-        return torch.addmm(residual, tmp_out, self.down_proj.weight)
+        return torch.mm(tmp_out, self.down_proj_weight)
--- a/colossalai/inference/modeling/policy/nopadding_llama.py
+++ b/colossalai/inference/modeling/policy/nopadding_llama.py
@@ -29,8 +29,10 @@ except:
 def get_triton_rmsnorm_forward():
    if HAS_TRITON_RMSNORM:

-        def _triton_rmsnorm_forward(self: LlamaRMSNorm, hidden_states: torch.Tensor, norm_output: torch.Tensor):
-            return rms_layernorm(hidden_states, self.weight.data, self.variance_epsilon, norm_output)
+        def _triton_rmsnorm_forward(
+            self: LlamaRMSNorm, hidden_states: torch.Tensor, norm_output: torch.Tensor, residual: torch.Tensor = None
+        ):
+            return rms_layernorm(hidden_states, self.weight.data, self.variance_epsilon, norm_output, residual)

        return _triton_rmsnorm_forward
    else: