mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-06 19:40:28 +00:00
[feat] cuda graph support and refactor non-functional api
This commit is contained in:
@@ -1,5 +1,3 @@
|
||||
import torch
|
||||
|
||||
try:
|
||||
import triton
|
||||
import triton.language as tl
|
||||
@@ -94,7 +92,10 @@ if HAS_TRITON:
|
||||
|
||||
def rms_layernorm(x, weight, eps, norm_output=None, residual=None):
|
||||
# allocate output
|
||||
y = torch.empty_like(x) if norm_output is None else norm_output
|
||||
# y = torch.empty_like(x) if norm_output is None else norm_output
|
||||
y = (
|
||||
x * 0 if norm_output is None else norm_output
|
||||
) # to make the operation non-functional, store y as the intermediate activation
|
||||
M, N = x.shape
|
||||
# Less than 64KB per feature: enqueue fused kernel
|
||||
MAX_FUSED_SIZE = 65536 // x.element_size()
|
||||
|
Reference in New Issue
Block a user