[misc] refactor launch API and tensor constructor (#5666)

* [misc] remove config arg from initialize
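  A minimal sketch of the call sites after this change (hedged: argument names are assumed from the pre-existing launch API, not taken verbatim from this diff):

      import colossalai

      # before this PR an (often empty) config dict was mandatory:
      #   colossalai.launch_from_torch(config={})
      # after the refactor the config argument is gone:
      colossalai.launch_from_torch()

      # alternatively, outside torchrun, the explicit launcher drops
      # config as well (remaining parameters assumed unchanged):
      #   colossalai.launch(rank=0, world_size=1, host="localhost", port=29500, backend="nccl")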

* [misc] remove old tensor constructor

* [plugin] add npu support for ddp
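  A hedged usage sketch of the NPU-enabled DDP path (model and optimizer are placeholders; the Booster wiring follows the documented plugin pattern and assumes launch via torchrun):

      import torch.nn as nn
      from torch.optim import Adam

      import colossalai
      from colossalai.booster import Booster
      from colossalai.booster.plugin import TorchDDPPlugin

      colossalai.launch_from_torch()

      # supported_devices() now reports ["cuda", "npu"], so the same
      # script can run on either accelerator without code changes
      plugin = TorchDDPPlugin()
      booster = Booster(plugin=plugin)

      model = nn.Linear(16, 8)                       # placeholder model
      optimizer = Adam(model.parameters(), lr=1e-3)  # placeholder optimizer
      model, optimizer, *_ = booster.boost(model, optimizer)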

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [devops] fix doc test ci

* [test] fix test launch

* [doc] update launch doc

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Author: Hongxin Liu
Date: 2024-04-29 10:40:11 +08:00
Committed by: GitHub
Parent: 91fa553775
Commit: 7f8b16635b
223 changed files with 294 additions and 403 deletions

colossalai/booster/plugin/torch_ddp_plugin.py

@@ -10,6 +10,7 @@ from colossalai.checkpoint_io import CheckpointIO, GeneralCheckpointIO
 from colossalai.cluster import DistCoordinator
 from colossalai.interface import ModelWrapper, OptimizerWrapper
 from colossalai.quantization import BnbQuantizationConfig, quantize_model
+from colossalai.utils import get_current_device
 
 from .dp_plugin_base import DPPluginBase
@@ -203,7 +204,7 @@ class TorchDDPPlugin(DPPluginBase):
         return True
 
     def supported_devices(self) -> List[str]:
-        return ["cuda"]
+        return ["cuda", "npu"]
 
     def configure(
         self,
@@ -214,7 +215,7 @@ class TorchDDPPlugin(DPPluginBase):
         lr_scheduler: Optional[LRScheduler] = None,
     ) -> Tuple[nn.Module, OptimizerWrapper, Callable, DataLoader, LRScheduler]:
         # cast model to cuda
-        model = model.cuda()
+        model = model.to(get_current_device())
 
         # convert model to sync bn
         model = nn.SyncBatchNorm.convert_sync_batchnorm(model, None)
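
The device-agnostic cast above goes through colossalai.utils.get_current_device, which resolves the device from whichever accelerator is active instead of hard-coding CUDA. A small sketch of what callers can expect (device strings are illustrative):

    import torch
    from colossalai.utils import get_current_device

    # returns e.g. torch.device("cuda:0") on GPU nodes, or the
    # corresponding NPU device on Ascend machines
    device = get_current_device()
    x = torch.randn(4, 4).to(device)  # placement works on cuda and npu alike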