[npu] add npu support for gemini and zero (#5067)

* [npu] setup device utils (#5047)

* [npu] add npu device support

* [npu] support low level zero

* [test] update npu zero plugin test

* [hotfix] fix import

* [test] recover tests

* [npu] gemini support npu (#5052)

* [npu] refactor device utils

* [gemini] support npu

* [example] llama2+gemini support npu

* [kernel] add arm cpu adam kernel (#5065)

* [kernel] add arm cpu adam

* [optim] update adam optimizer

* [kernel] arm cpu adam remove bf16 support
Author: Hongxin Liu
Date: 2023-11-20 16:12:41 +08:00
Committed by: GitHub
Parent: 8d56c9c389
Commit: e5ce4c8ea6

46 changed files with 994 additions and 233 deletions
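
The hunks below are from the Gemini `Chunk` class. They lean on two symbols from `colossalai.utils.device`: `IS_NPU_AVAILABLE` and `get_current_device`. A minimal sketch of how that module might provide them, assuming Huawei's Ascend `torch_npu` extension is the NPU backend (the shipped module may differ):

    import torch

    try:
        # torch_npu registers the Ascend "npu" backend with PyTorch on import
        import torch_npu  # noqa: F401

        IS_NPU_AVAILABLE = torch.npu.is_available()
    except ImportError:
        IS_NPU_AVAILABLE = False


    def get_current_device() -> torch.device:
        # Prefer the NPU, then CUDA, then fall back to the CPU
        if IS_NPU_AVAILABLE:
            return torch.device(f"npu:{torch.npu.current_device()}")
        if torch.cuda.is_available():
            return torch.device(f"cuda:{torch.cuda.current_device()}")
        return torch.device("cpu")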


@@ -7,6 +7,7 @@ import torch.distributed as dist
 from torch.distributed import ProcessGroup
 
 from colossalai.utils import get_current_device
+from colossalai.utils.device import IS_NPU_AVAILABLE
 
 
 class TensorState(Enum):
@@ -172,7 +173,7 @@ class Chunk:
 
         if self.chunk_temp is not None:
             # this chunk is not closed
-            if self.chunk_temp.device.type == "cuda":
+            if self.chunk_temp.device.type == "cuda" or self.chunk_temp.device.type == "npu":
                 cuda_memory += self.chunk_mem
             else:
                 cpu_memory += self.chunk_mem
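
Note that an NPU-resident temp chunk is counted toward the existing `cuda_memory` total rather than a new bucket; the counter keeps its historical name. The bucketing rule reduces to the following standalone restatement (illustrative, not code from the patch):

    def memory_bucket(device_type: str) -> str:
        # Both accelerators share one bucket; everything else is host memory
        return "cuda" if device_type in ("cuda", "npu") else "cpu"

    assert memory_bucket("npu") == "cuda"  # NPU payloads count as accelerator memory
    assert memory_bucket("cpu") == "cpu"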
@@ -191,10 +192,8 @@ class Chunk:
         if self.chunk_temp is not None:
             return self.chunk_temp.device.type
         else:
-            if self.is_gathered:
-                return "cuda"
-            elif self.cuda_shard is not None:
-                return "cuda"
+            if self.is_gathered or self.cuda_shard is not None:
+                return "npu" if IS_NPU_AVAILABLE else "cuda"
             else:
                 return "cpu"
@@ -329,12 +328,12 @@ class Chunk:
         # when the current chunk is not synchronized with the optimizer
         # just use another way for the movement
         if not self.optim_sync_flag:
-            assert device.type == "cuda", "each chunk should first be moved to CUDA"
+            assert device.type == "cuda" or device.type == "npu", "each chunk should first be moved to CUDA"
             self.__paired_shard_move()
             self.optim_sync_flag = True
             return
 
-        if device.type == "cuda":
+        if device.type == "cuda" or device.type == "npu":
             assert device == get_current_device(), "can't move chunk to another device"
 
             if self.cuda_shard:
@@ -484,7 +483,7 @@ class Chunk:
             assert friend_chunk.is_gathered is True
             self.cuda_global_chunk.copy_(friend_chunk.cuda_global_chunk)
             self.optim_sync_flag = True
-        elif friend_chunk.device_type == "cuda" and self.device_type == "cuda":
+        elif friend_chunk.device_type in ("cuda", "npu") and self.device_type in ("cuda", "npu"):
             self.cuda_shard.copy_(friend_chunk.cuda_shard)
             self.optim_sync_flag = True
             self.cpu_vis_flag = False
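
The `device.type == "cuda" or device.type == "npu"` test now appears at several call sites across these hunks. One way a follow-up could consolidate it, sketched as a hypothetical helper that is not part of this patch:

    import torch

    ACCELERATOR_TYPES = ("cuda", "npu")

    def is_accelerator(device: torch.device) -> bool:
        # One place to extend when the next backend arrives
        return device.type in ACCELERATOR_TYPES

    assert is_accelerator(torch.device("cuda", 0))
    assert not is_accelerator(torch.device("cpu"))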