[Device] Support npu (#6159)

* support npu

* support pretrain

* support lora

* support chatglm

* Update train.py

* assorted fixes

* [pre-commit.ci] auto fixes from pre-commit.com hooks

For more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Author: flybird11111
Date: 2024-12-17 15:42:39 +08:00
Committed by: GitHub
Parent: e994c64568
Commit: aaafb38851

18 changed files with 295 additions and 152 deletions


@@ -9,6 +9,7 @@ from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
 from torch.utils.data import DataLoader
 from tqdm import tqdm
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.cluster import DistCoordinator
@@ -59,7 +60,9 @@ def warm_up(
     for i, data in enumerate(dataloader):
         if i > num_runs:
             break
-        inputs, labels = data[0].cuda(), data[1].cuda()
+        inputs, labels = data[0].to(get_accelerator().get_current_device()), data[1].to(
+            get_accelerator().get_current_device()
+        )
         outputs = model(inputs, labels=labels)
         loss = criterion(outputs)
         booster.backward(loss, optimizer)
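This pattern recurs throughout the patch: every hard-coded `.cuda()` call is routed through ColossalAI's accelerator abstraction instead. A minimal sketch of the idea, using a hypothetical `move_to_device` helper (not from the patch):

```python
from colossalai.accelerator import get_accelerator

def move_to_device(batch):
    # get_current_device() resolves to the active backend's device
    # (e.g. a CUDA GPU or an Ascend NPU), so the same code runs on either.
    device = get_accelerator().get_current_device()
    return {k: v.to(device) for k, v in batch.items()}
```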
@@ -85,7 +88,7 @@ def benchmark(
     warm_up_steps: int = 3,
 ):
     results = {}
-    model_device = torch.cuda.current_device()
+    model_device = get_accelerator().get_current_device()

     # Warm up
     warm_up_fn(
@@ -106,8 +109,8 @@ def benchmark(
     # Measure Allocated Memory and Throughput
     memory = {}
     throughput = {}
-    torch.cuda.reset_peak_memory_stats(device=model_device)
-    pre_mem = torch.cuda.memory_allocated(device=model_device)
+    get_accelerator().reset_peak_memory_stats(device=model_device)
+    pre_mem = get_accelerator().memory_allocated(device=model_device)

     start_time = time()
@@ -116,7 +119,9 @@ def benchmark(
             dataloader, desc=f"Epoch [{epoch + 1}/{epoch_num}]", disable=not DistCoordinator().is_master()
         ) as pbar:
             for data in pbar:
-                inputs, labels = data[0].cuda(), data[1].cuda()
+                inputs, labels = data[0].to(get_accelerator().get_current_device()), data[1].to(
+                    get_accelerator().get_current_device()
+                )
                 outputs = model(inputs, labels=labels)
                 loss = criterion(outputs)
                 booster.backward(loss, optimizer)
@@ -128,8 +133,8 @@ def benchmark(
     all_sample = epoch_num * len(dataloader)

-    post_mem = torch.cuda.memory_allocated(device=model_device)
-    max_mem = torch.cuda.max_memory_allocated(device=model_device)
+    post_mem = get_accelerator().memory_allocated(device=model_device)
+    max_mem = get_accelerator().max_memory_allocated(device=model_device)

     memory[f"batch_size_{batch_size}"] = {
         "cuda_pre_training_bytes": format_num(pre_mem, bytes=True),


@@ -38,7 +38,7 @@ criterion = lambda x: x.loss
 def move_to_cuda(batch):
-    return {k: v.cuda() for k, v in batch.items()}
+    return {k: v.to(get_accelerator().get_current_device()) for k, v in batch.items()}


 @torch.no_grad()
@@ -266,7 +266,8 @@ def main():
     cfg = AutoConfig.from_pretrained(model_name, num_labels=data_builder.num_labels)
     if model_name == "bert-base-uncased":
-        model = BertForSequenceClassification.from_pretrained(model_name, config=cfg).cuda()
+        model = BertForSequenceClassification.from_pretrained(model_name, config=cfg)
+        model = model.to(get_accelerator().get_current_device())
     elif model_name == "albert-xxlarge-v2":
         model = AlbertForSequenceClassification.from_pretrained(model_name, config=cfg)
     else:
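Note the two-step placement: `from_pretrained(...).cuda()` becomes a plain load followed by an explicit `.to(...)`. A sketch of the general pattern (the model class here is only an illustration):

```python
from colossalai.accelerator import get_accelerator
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
# .cuda() pins the model to NVIDIA GPUs; .to(get_current_device()) lets
# the accelerator layer pick CUDA, NPU, or CPU as appropriate.
model = model.to(get_accelerator().get_current_device())
```

A third file, the pretraining benchmark, repeats the swap across each of its plugin configurations: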


@@ -154,7 +154,7 @@ def main():
             offload_param_frac=args.offload_param_frac,
             tp_size=args.tp,
             extra_dp_size=args.extra_dp,
-            enable_fused_normalization=torch.cuda.is_available(),
+            enable_fused_normalization=get_accelerator().is_available(),
             enable_flash_attention=args.xformers,
             max_prefetch=args.prefetch_num,
             enable_async_reduce=not args.disable_async_reduce,
@@ -168,7 +168,7 @@ def main():
             warmup_non_model_data_ratio=args.warmup_ratio,
             tp_size=args.tp,
             extra_dp_size=args.extra_dp,
-            enable_fused_normalization=torch.cuda.is_available(),
+            enable_fused_normalization=get_accelerator().is_available(),
             max_prefetch=args.prefetch_num,
             enable_async_reduce=not args.disable_async_reduce,
             enable_flash_attention=args.xformers,
@@ -245,7 +245,7 @@ def main():
             sp_size=args.sp,
             sequence_parallelism_mode=args.sp_mode,
             enable_sequence_parallelism=args.sp > 1,
-            enable_fused_normalization=torch.cuda.is_available(),
+            enable_fused_normalization=get_accelerator().is_available(),
             enable_flash_attention=args.xformers,
             microbatch_size=args.mbs,
             precision="bf16",
@@ -264,7 +264,7 @@ def main():
             num_model_chunks=args.n_chunks,
             zero_stage=args.zero,
             cpu_offload=True,
-            enable_fused_normalization=torch.cuda.is_available(),
+            enable_fused_normalization=get_accelerator().is_available(),
             enable_flash_attention=args.xformers,
             microbatch_size=args.mbs,
             initial_scale=2**8,
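Each plugin previously gated fused normalization on `torch.cuda.is_available()`, which reports False on NPU machines (torch_npu is a separate backend), so the fused kernel was silently disabled there. `get_accelerator().is_available()` is True whenever the active backend has a usable device. A minimal sketch, assuming the Gemini plugin whose keywords the hunks show and a distributed context already set up via `colossalai.launch`:

```python
from colossalai.accelerator import get_accelerator
from colossalai.booster.plugin import GeminiPlugin

# Enable the fused-normalization kernel whenever any supported
# accelerator backend (CUDA or NPU) is present, not just CUDA.
plugin = GeminiPlugin(
    enable_fused_normalization=get_accelerator().is_available(),
)
```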
@@ -287,8 +287,8 @@ def main():
         config = MODEL_CONFIGS[args.config]
     else:
         config = AutoConfig.from_pretrained(args.config, trust_remote_code=True)
-    torch.cuda.manual_seed(42)
+    get_accelerator().manual_seed(42)

     dataset = RandomDataset(
         num_samples=args.batch_size * args.num_steps * dp_size, max_length=args.max_length, vocab_size=config.vocab_size
     )
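Seeding follows suit: `torch.cuda.manual_seed(42)` only seeds the CUDA RNG state, so it is replaced by the accelerator's own `manual_seed`:

```python
from colossalai.accelerator import get_accelerator

# Seeds the RNG of whichever backend is active (CUDA, NPU, ...),
# keeping runs reproducible across device types.
get_accelerator().manual_seed(42)
```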
@@ -311,7 +311,6 @@ def main():
             config,
             trust_remote_code=True,
             **init_kwargs,
-            attn_implementation="flash_attention_2",
             torch_dtype=torch.bfloat16,
         )
     if args.grad_checkpoint:
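The hard-coded `attn_implementation="flash_attention_2"` is dropped because FlashAttention-2 kernels are generally available only on CUDA GPUs; forcing them would break model construction on NPU. If flash attention is still wanted opportunistically, one option (a sketch, not part of the patch) is to select the backend at runtime:

```python
import torch

# Fall back to the default eager attention when CUDA (and hence
# FlashAttention-2) is unavailable, e.g. on Ascend NPU.
attn_impl = "flash_attention_2" if torch.cuda.is_available() else "eager"
# model = AutoModelForCausalLM.from_pretrained(..., attn_implementation=attn_impl)
```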
@@ -321,9 +320,13 @@ def main():
     model_numel = get_model_numel(model)
     coordinator.print_on_master(f"Model params: {format_numel_str(model_numel)}")

+    if config.model_type == "chatglm":
+        num_layers = model.config.num_layers
+    else:
+        num_layers = model.config.num_hidden_layers
     performance_evaluator = PerformanceEvaluator(
         model_numel,
-        model.config.num_hidden_layers,
+        num_layers,
         model.config.hidden_size,
         model.config.vocab_size,
         args.grad_checkpoint,
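ChatGLM's configuration names its depth `num_layers` rather than the usual `num_hidden_layers`, hence the branch above. An equivalent, more general lookup (a sketch, not from the patch) would be:

```python
# Try the common HF attribute first, then ChatGLM's spelling.
num_layers = getattr(config, "num_hidden_layers", None) or getattr(config, "num_layers", None)
if num_layers is None:
    raise ValueError("config exposes neither num_hidden_layers nor num_layers")
```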
@@ -337,7 +340,7 @@ def main():
     torch.set_default_dtype(torch.float)
     coordinator.print_on_master(
-        f"Booster init max CUDA memory: {get_accelerator().max_memory_allocated()/1024**2:.2f} MB"
+        f"Booster init max device memory: {get_accelerator().max_memory_allocated()/1024**2:.2f} MB"
     )
     coordinator.print_on_master(
         f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024:.2f} MB"
@@ -389,7 +392,7 @@ def main():
             performance_evaluator.on_step_end(**batch)
             prof.step()
     performance_evaluator.on_fit_end()
-    coordinator.print_on_master(f"Max CUDA memory usage: {get_accelerator().max_memory_allocated()/1024**2:.2f} MB")
+    coordinator.print_on_master(f"Max device memory usage: {get_accelerator().max_memory_allocated()/1024**2:.2f} MB")


 if __name__ == "__main__":