[npu] add npu support for gemini and zero (#5067)

* [npu] setup device utils (#5047)

* [npu] add npu device support

* [npu] support low level zero

* [test] update npu zero plugin test

* [hotfix] fix import

* [test] recover tests

* [npu] gemini support npu (#5052)

* [npu] refactor device utils

* [gemini] support npu

* [example] llama2+gemini support npu

* [kernel] add arm cpu adam kernel (#5065)

* [kernel] add arm cpu adam

* [optim] update adam optimizer

* [kernel] arm cpu adam remove bf16 support
Author: Hongxin Liu
Date: 2023-11-20 16:12:41 +08:00
Committed by: GitHub
Parent: 8d56c9c389
Commit: e5ce4c8ea6
46 changed files with 994 additions and 233 deletions
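
For orientation before the file diffs: the "[npu] setup device utils" work amounts to detecting an Ascend NPU and routing tensors to it, falling back to CUDA or CPU otherwise. Below is a minimal sketch of that pattern, assuming the torch_npu adapter is installed on Ascend machines; the names IS_NPU_AVAILABLE and get_current_device are illustrative, not necessarily this commit's exact API:

import torch

try:
    import torch_npu  # Ascend adapter; registers the "npu" device with PyTorch

    IS_NPU_AVAILABLE = torch_npu.npu.is_available()
except ImportError:
    IS_NPU_AVAILABLE = False


def get_current_device() -> torch.device:
    """Return the accelerator for the current process, preferring NPU over CUDA."""
    if IS_NPU_AVAILABLE:
        return torch.device(f"npu:{torch_npu.npu.current_device()}")
    if torch.cuda.is_available():
        return torch.device(f"cuda:{torch.cuda.current_device()}")
    return torch.device("cpu")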

op_builder/__init__.py

@@ -1,3 +1,4 @@
+from .arm_cpu_adam import ArmCPUAdamBuilder
 from .cpu_adam import CPUAdamBuilder
 from .fused_optim import FusedOptimBuilder
 from .layernorm import LayerNormBuilder
@@ -29,4 +30,5 @@ __all__ = [
     "MultiTensorLambBuilder",
     "MultiTensorScaleBuilder",
     "MultiTensorL2NormBuilder",
+    "ArmCPUAdamBuilder",
 ]
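
With the import and __all__ entry above, the new builder is reachable from the package root. A quick illustrative check (the class itself is defined in the new file below):

from op_builder import ArmCPUAdamBuilder

builder = ArmCPUAdamBuilder()
print(builder.name)                  # "arm_cpu_adam"
print(builder.prebuilt_import_path)  # "colossalai._C.arm_cpu_adam"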

op_builder/arm_cpu_adam.py (new file)

@@ -0,0 +1,34 @@
+from .builder import Builder
+
+
+class ArmCPUAdamBuilder(Builder):
+    NAME = "arm_cpu_adam"
+    PREBUILT_IMPORT_PATH = "colossalai._C.arm_cpu_adam"
+    ext_type = "cpu"
+
+    def __init__(self):
+        super().__init__(name=ArmCPUAdamBuilder.NAME, prebuilt_import_path=ArmCPUAdamBuilder.PREBUILT_IMPORT_PATH)
+        self.version_dependent_macros = ["-DVERSION_GE_1_1", "-DVERSION_GE_1_3", "-DVERSION_GE_1_5"]
+
+    # necessary 4 functions
+    def sources_files(self):
+        ret = [
+            self.csrc_abs_path("cpu_adam_arm.cpp"),
+        ]
+        return ret
+
+    def include_dirs(self):
+        return [self.csrc_abs_path("includes")]
+
+    def cxx_flags(self):
+        extra_cxx_flags = [
+            "-std=c++14",
+            "-std=c++17",
+            "-g",
+            "-Wno-reorder",
+            "-fopenmp",
+        ]
+        return ["-O3"] + self.version_dependent_macros + extra_cxx_flags
+
+    def nvcc_flags(self):
+        return []
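
Downstream code would consume this builder through the base class's JIT entry point, whose ImportError fallback appears in the builder.py diff below. A hedged sketch (load() follows the existing op_builder convention and is assumed here):

from op_builder import ArmCPUAdamBuilder

# Import the prebuilt colossalai._C.arm_cpu_adam module if it exists;
# otherwise JIT-compile cpu_adam_arm.cpp with the cxx_flags defined above.
# Since ext_type is "cpu", the CUDA runtime checks are skipped at build time.
arm_cpu_adam = ArmCPUAdamBuilder().load()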

op_builder/builder.py

@@ -7,7 +7,7 @@ import os
 import time
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import List, Optional
+from typing import List, Optional, Union
 
 from .utils import check_cuda_availability, check_system_pytorch_cuda_match, print_rank_0
 
@@ -21,6 +21,8 @@ class Builder(ABC):
         prebuilt_import_path (str): the path where the extension is installed during pip install
     """
 
+    ext_type: str = "cuda"
+
     def __init__(self, name: str, prebuilt_import_path: str):
         self.name = name
         self.prebuilt_import_path = prebuilt_import_path
@@ -165,7 +167,8 @@
             )
         except ImportError:
             # check environment
-            self.check_runtime_build_environment()
+            if self.ext_type == "cuda":
+                self.check_runtime_build_environment()
 
             # time the kernel compilation
             start_build = time.time()
@@ -208,11 +211,19 @@
 
         return op_module
 
-    def builder(self) -> "CUDAExtension":
+    def builder(self) -> Union["CUDAExtension", "CppExtension"]:
         """
        get a CUDAExtension instance used for setup.py
         """
-        from torch.utils.cpp_extension import CUDAExtension
+        from torch.utils.cpp_extension import CppExtension, CUDAExtension
+
+        if self.ext_type == "cpp":
+            return CppExtension(
+                name=self.prebuilt_import_path,
+                sources=self.strip_empty_entries(self.sources_files()),
+                include_dirs=self.strip_empty_entries(self.include_dirs()),
+                extra_compile_args=self.strip_empty_entries(self.cxx_flags()),
+            )
 
         return CUDAExtension(
             name=self.prebuilt_import_path,