Optimized the idle interval between CUDA kernel executions caused by view and memory-copy operations (#5390)

* opt_view_and_memcopy
* fix bugs in ci
* fix ci bugs
* update benchmark scripts
* fix ci bugs
2025-09-06 19:40:28 +00:00 · 2024-02-21 13:23:57 +08:00
parent 730103819d
commit 2a718c8be8
8 changed files with 141 additions and 55 deletions
--- a/colossalai/inference/modeling/policy/nopadding_llama.py
+++ b/colossalai/inference/modeling/policy/nopadding_llama.py
@@ -29,8 +29,10 @@ except:
 def get_triton_rmsnorm_forward():
    if HAS_TRITON_RMSNORM:

-        def _triton_rmsnorm_forward(self: LlamaRMSNorm, hidden_states: torch.Tensor, norm_output: torch.Tensor):
-            return rms_layernorm(hidden_states, self.weight.data, self.variance_epsilon, norm_output)
+        def _triton_rmsnorm_forward(
+            self: LlamaRMSNorm, hidden_states: torch.Tensor, norm_output: torch.Tensor, residual: torch.Tensor = None
+        ):
+            return rms_layernorm(hidden_states, self.weight.data, self.variance_epsilon, norm_output, residual)

        return _triton_rmsnorm_forward
    else: