[setup] support pre-build and jit-build of cuda kernels (#2374)

* [setup] support pre-build and jit-build of cuda kernels

* polish code

* polish code

* polish code

* polish code

* polish code

* polish code
Author: Frank Lee
Date: 2023-01-06 20:50:26 +08:00
Committed by: GitHub
Parent: 12c8bf38d7
Commit: 40d376c566
36 changed files with 414 additions and 390 deletions


@@ -18,11 +18,15 @@ from colossalai.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS, TENSOR_PARA
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.global_variables import tensor_parallel_env as env
-from colossalai.kernel import fused_optim
 from colossalai.tensor import ColoParameter, ProcessGroup
 from .multi_tensor_apply import multi_tensor_applier

+try:
+    from colossalai._C import fused_optim
+except:
+    fused_optim = None
+

 def print_rank_0(msg: str, logger=None):
     """Print messages and save logs(optional). This is executed only if you are the rank-0 gpu.
@@ -123,6 +127,13 @@ def is_model_parallel_parameter(p):


 def _calc_l2_norm(grads):
+    # we should not
+    global fused_optim
+    if fused_optim is None:
+        from colossalai.kernel.op_builder import FusedOptimBuilder
+        fused_optim = FusedOptimBuilder().load()
+
     norm = 0.0
     if len(grads) > 0:
         dummy_overflow_buf = torch.cuda.IntTensor([0])
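
The two hunks above turn fused_optim from a hard import into a lazy, on-demand dependency: try the pre-built extension (colossalai._C) first, and only JIT-build via the op builder when it is missing and actually needed. Below is a minimal, runnable sketch of that fallback pattern under assumed names; `my_ext`, `fused_scale`, and the pure-PyTorch fallback are illustrative stand-ins, not ColossalAI APIs, whereas the real commit calls FusedOptimBuilder().load() at the marked spot.

    import torch

    try:
        # Fast path: a pre-built CUDA extension bundled with the wheel.
        # `my_ext` is a hypothetical module standing in for colossalai._C.
        from my_ext import fused_scale
    except ImportError:
        fused_scale = None


    def _get_fused_scale():
        """Return the fused op, building and caching a fallback on first use."""
        global fused_scale
        if fused_scale is None:
            # In the commit this is where FusedOptimBuilder().load() JIT-compiles
            # the CUDA kernel; a pure-PyTorch lambda keeps this sketch runnable
            # without a CUDA toolchain.
            fused_scale = lambda tensors, factor: [t * factor for t in tensors]
        return fused_scale


    def scale_all(tensors, factor):
        # Callers never need to know whether the pre-built, JIT-built, or
        # fallback implementation is in use.
        return _get_fused_scale()(tensors, factor)


    if __name__ == "__main__":
        print(scale_all([torch.ones(3), torch.arange(3.0)], 2.0))
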


@@ -14,7 +14,6 @@ class MultiTensorApply(object):

     def __init__(self, chunk_size):
         try:
-            from colossalai.kernel import fused_optim
             MultiTensorApply.available = True
             self.chunk_size = chunk_size
         except ImportError as err:
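
This last hunk drops the eager kernel import while keeping the class-level `available` probe. As a rough, illustrative sketch of that availability-flag pattern (the names below are assumptions, not the exact ColossalAI class):

    class MultiTensorApply:
        # Class-level flag recording whether the fused multi-tensor path is usable.
        available = False

        def __init__(self, chunk_size):
            try:
                import my_ext  # hypothetical compiled extension; the import is only a probe
                MultiTensorApply.available = True
            except ImportError:
                MultiTensorApply.available = False
            self.chunk_size = chunk_size

        def __call__(self, op, noop_flag_buffer, tensor_lists, *args):
            # Dispatch to a fused multi-tensor op with the configured chunk size.
            return op(self.chunk_size, noop_flag_buffer, tensor_lists, *args)
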