[npu] add npu support for gemini and zero (#5067)

* [npu] setup device utils (#5047)

* [npu] add npu device support

* [npu] support low level zero

* [test] update npu zero plugin test

* [hotfix] fix import

* [test] recover tests

* [npu] gemini support npu (#5052)

* [npu] refactor device utils

* [gemini] support npu

* [example] llama2+gemini support npu

* [kernel] add arm cpu adam kernel (#5065)

* [kernel] add arm cpu adam

* [optim] update adam optimizer

* [kernel] arm cpu adam remove bf16 support
Author: Hongxin Liu
Date: 2023-11-20 16:12:41 +08:00 (committed by GitHub)
Parent: 8d56c9c389
Commit: e5ce4c8ea6
46 changed files with 994 additions and 233 deletions

colossalai/utils/__init__.py

@@ -7,7 +7,7 @@ from .common import (
     is_ddp_ignored,
     set_seed,
 )
-from .cuda import empty_cache, get_current_device, set_device, set_to_cuda, synchronize
+from .device import IS_NPU_AVAILABLE, empty_cache, get_current_device, set_device, set_to_cuda, synchronize
 from .multi_tensor_apply import multi_tensor_applier
 from .tensor_detector import TensorDetector
 from .timer import MultiTimer, Timer
@@ -29,4 +29,5 @@ __all__ = [
     "set_seed",
     "is_ddp_ignored",
     "set_device",
+    "IS_NPU_AVAILABLE",
 ]
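With the re-export above, the public import path is unchanged: callers keep importing from colossalai.utils and additionally gain the IS_NPU_AVAILABLE flag. A minimal usage sketch (illustrative, not part of the diff):

from colossalai.utils import IS_NPU_AVAILABLE, get_current_device

# Resolves to cuda:N, npu:N, or cpu depending on which backend is present.
device = get_current_device()
print(f"NPU available: {IS_NPU_AVAILABLE}, current device: {device}")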

colossalai/utils/cuda.py (deleted, 56 lines)

@@ -1,56 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-

from typing import Optional

import torch
import torch.distributed as dist


def set_to_cuda(models):
    """Send model to gpu.

    :param models: nn.module or a list of module
    """
    if isinstance(models, list) and len(models) > 1:
        ret = []
        for model in models:
            ret.append(model.to(get_current_device()))
        return ret
    elif isinstance(models, list):
        return models[0].to(get_current_device())
    else:
        return models.to(get_current_device())


def get_current_device() -> torch.device:
    """
    Returns currently selected device (gpu/cpu).
    If cuda available, return gpu, otherwise return cpu.
    """
    if torch.cuda.is_available():
        return torch.device(f"cuda:{torch.cuda.current_device()}")
    else:
        return torch.device("cpu")


def synchronize():
    """Similar to cuda.synchronize().
    Waits for all kernels in all streams on a CUDA device to complete.
    """
    if torch.cuda.is_available():
        torch.cuda.synchronize()


def empty_cache():
    """Similar to cuda.empty_cache()
    Releases all unoccupied cached memory currently held by the caching allocator.
    """
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def set_device(index: Optional[int] = None) -> None:
    if index is None:
        index = dist.get_rank() % torch.cuda.device_count()
    torch.cuda.set_device(index)

colossalai/utils/device.py (new file, 207 lines)

@@ -0,0 +1,207 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-

from typing import Any, Dict, List, Optional, Tuple

import torch
import torch.distributed as dist

IS_NPU_AVAILABLE: bool = False
try:
    import torch_npu  # noqa

    IS_NPU_AVAILABLE = torch.npu.is_available()
except ImportError:
    pass


def set_to_cuda(models):
    """Send model to gpu.

    :param models: nn.module or a list of module
    """
    if isinstance(models, list) and len(models) > 1:
        ret = []
        for model in models:
            ret.append(model.to(get_current_device()))
        return ret
    elif isinstance(models, list):
        return models[0].to(get_current_device())
    else:
        return models.to(get_current_device())


def get_current_device() -> torch.device:
    """
    Returns currently selected device (gpu/cpu).
    If cuda available, return gpu, otherwise return cpu.
    """
    if torch.cuda.is_available():
        return torch.device(f"cuda:{torch.cuda.current_device()}")
    elif IS_NPU_AVAILABLE:
        return torch.device(f"npu:{torch.npu.current_device()}")
    else:
        return torch.device("cpu")


def _dispatch_device_func(fn_name: str, *args, **kwargs):
    if torch.cuda.is_available():
        return getattr(torch.cuda, fn_name)(*args, **kwargs)
    elif IS_NPU_AVAILABLE:
        return getattr(torch.npu, fn_name)(*args, **kwargs)
    else:
        raise RuntimeError("No device available")


# device semantics


def can_device_access_peer(device, peer_device) -> bool:
    return _dispatch_device_func("can_device_access_peer", device, peer_device)


def current_device() -> int:
    return _dispatch_device_func("current_device")


def current_stream(device=None):
    return _dispatch_device_func("current_stream", device)


def default_stream(device=None):
    return _dispatch_device_func("default_stream", device)


def device_count() -> int:
    return _dispatch_device_func("device_count")


def get_device_capability(device=None) -> Tuple[int, int]:
    return _dispatch_device_func("get_device_capability", device)


def get_device_name(device=None) -> str:
    return _dispatch_device_func("get_device_name", device)


def get_device_properties(device):
    return _dispatch_device_func("get_device_properties", device)


def set_device(index: Optional[int] = None) -> None:
    if index is None:
        index = dist.get_rank() % device_count()
    _dispatch_device_func("set_device", index)


def set_stream(stream_):
    return _dispatch_device_func("set_stream", stream_)


def stream(stream_):
    return _dispatch_device_func("stream", stream_)


def synchronize():
    return _dispatch_device_func("synchronize")


def utilization(device=None) -> int:
    return _dispatch_device_func("utilization", device)


# random number generator


def get_rng_state(device="cuda") -> torch.Tensor:
    return _dispatch_device_func("get_rng_state", device)


def get_rng_state_all() -> List[torch.Tensor]:
    return _dispatch_device_func("get_rng_state_all")


def set_rng_state(new_state: torch.ByteTensor, device="cuda") -> None:
    return _dispatch_device_func("set_rng_state", new_state, device)


def set_rng_state_all(new_states: List[torch.ByteTensor]) -> None:
    return _dispatch_device_func("set_rng_state_all", new_states)


def manual_seed(seed: int) -> None:
    return _dispatch_device_func("manual_seed", seed)


def manual_seed_all(seed: int) -> None:
    return _dispatch_device_func("manual_seed_all", seed)


def seed() -> None:
    return _dispatch_device_func("seed")


def seed_all() -> None:
    return _dispatch_device_func("seed_all")


def initial_seed() -> int:
    return _dispatch_device_func("initial_seed")


# streams and events


def Stream(device=None, priority=0, **kwargs):
    return _dispatch_device_func("Stream", device, priority, **kwargs)


def Event(enable_timing: bool = False, blocking: bool = False, interprocess: bool = False):
    return _dispatch_device_func("Event", enable_timing, blocking, interprocess)


# memory management


def empty_cache() -> None:
    return _dispatch_device_func("empty_cache")


def memory_stats(device=None) -> Dict[str, Any]:
    return _dispatch_device_func("memory_stats", device)


def memory_summary(device=None, abbreviated=False) -> str:
    return _dispatch_device_func("memory_summary", device, abbreviated)


def memory_snapshot():
    return _dispatch_device_func("memory_snapshot")


def memory_allocated(device=None) -> int:
    return _dispatch_device_func("memory_allocated", device)


def max_memory_allocated(device=None) -> int:
    return _dispatch_device_func("max_memory_allocated", device)


def reset_max_memory_allocated(device=None) -> None:
    return _dispatch_device_func("reset_max_memory_allocated", device)


def memory_reserved(device=None) -> int:
    return _dispatch_device_func("memory_reserved", device)


def max_memory_reserved(device=None) -> int:
    return _dispatch_device_func("max_memory_reserved", device)


def set_per_process_memory_fraction(fraction: float, device=None) -> None:
    return _dispatch_device_func("set_per_process_memory_fraction", fraction, device)


def reset_peak_memory_stats(device=None) -> None:
    return _dispatch_device_func("reset_peak_memory_stats", device)
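Every wrapper above funnels through _dispatch_device_func, so a single call site works on CUDA or on Ascend NPU (when torch_npu is importable) and raises RuntimeError if neither backend is present. An illustrative, device-agnostic sketch built on this module (the workload itself is made up and not part of the commit):

import torch

from colossalai.utils import device as device_utils

dev = device_utils.get_current_device()  # cuda:N, npu:N, or cpu
x = torch.randn(1024, 1024, device=dev)
y = x @ x
if dev.type != "cpu":
    device_utils.synchronize()  # dispatched to torch.cuda or torch.npu
    print(device_utils.memory_allocated())  # allocator stats from the active backend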

colossalai/utils/timer.py

@@ -3,7 +3,7 @@
 import time
 from typing import Tuple
-from .cuda import synchronize
+from .device import synchronize
 class Timer: