diff --git a/colossalai/utils/model/colo_init_context.py b/colossalai/utils/model/colo_init_context.py
index 7a9b3ff25..851543e4a 100644
--- a/colossalai/utils/model/colo_init_context.py
+++ b/colossalai/utils/model/colo_init_context.py
@@ -36,8 +36,13 @@ def _convert_to_coloparam(param: torch.nn.Parameter,
         return param
     # detaching tensor is necessary for optimizers.
     requires_grad = param.requires_grad
-    # param is the global tensor.
-    colo_param = ColoParameter(param.to(device=device, dtype=dtype), requires_grad=requires_grad)
+
+    if param.device.type == 'meta':
+        raise NotImplementedError(
+            "ColoInitContext is initializing a model with meta parameters! This is not allowed right now!")
+    else:
+        # param is the global tensor.
+        colo_param = ColoParameter(param.to(device=device, dtype=dtype), requires_grad=requires_grad)
 
     # if default_shard_plan exists, shard the param during initialization.
     # This can reduce the model size after initialization.
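For context, here is a minimal sketch (not part of the patch) of the case the new guard rejects: a parameter created on the `meta` device has shape and dtype but no storage, so `param.to(device=device, dtype=dtype)` cannot produce real data for `ColoParameter`. The toy `nn.Linear` below is only an illustration.

```python
import torch
import torch.nn as nn

# A parameter materialized on the meta device carries shape/dtype but no data.
meta_linear = nn.Linear(4, 4, device='meta')
param = meta_linear.weight
print(param.device.type)  # 'meta'
print(param.is_meta)      # True

# The same check the patch adds in _convert_to_coloparam: reject meta
# parameters instead of silently building an uninitialized ColoParameter.
try:
    if param.device.type == 'meta':
        raise NotImplementedError(
            "ColoInitContext is initializing a model with meta parameters! "
            "This is not allowed right now!")
except NotImplementedError as e:
    print(f"Rejected: {e}")
```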