[builder] use builder() for cpu adam and fused optim in setup.py (#2187)

2025-09-01 01:06:00 +00:00 · 2022-12-23 16:05:13 +08:00
parent d42afd30f8
commit bc0e271e71
6 changed files with 50 additions and 45 deletions
--- a/colossalai/kernel/op_builder/cpu_adam.py
+++ b/colossalai/kernel/op_builder/cpu_adam.py
@@ -1,8 +1,7 @@
 import os
-import sys
-from pathlib import Path

 from .builder import Builder
+from .utils import append_nvcc_threads


 class CPUAdamBuilder(Builder):
@@ -28,37 +27,35 @@ class CPUAdamBuilder(Builder):
        ]

    def include_paths(self):
-        import torch
        from torch.utils.cpp_extension import CUDA_HOME
        cuda_include = os.path.join(CUDA_HOME, "include")
        return [os.path.join(CPUAdamBuilder.BASE_DIR, "includes"), cuda_include]

-    def colossalai_src_path(self, code_path):
-        if os.path.isabs(code_path):
-            return code_path
-        else:
-            return os.path.join(Path(__file__).parent.parent.absolute(), code_path)
-
    def strip_empty_entries(self, args):
        '''
        Drop any empty strings from the list of compile and link flags
        '''
        return [x for x in args if len(x) > 0]

-    def builder(self):
+    def builder(self, name) -> 'CUDAExtension':
+        """
+        Build the CUDAExtension called ``name`` for setup.py; host (cxx) and nvcc flag sets are kept separate.
+        """
        from torch.utils.cpp_extension import CUDAExtension
+
        return CUDAExtension(
-            name=self.name,
+            name=name,
            sources=[os.path.join('colossalai/kernel/cuda_native/csrc', path) for path in self.sources],
            include_dirs=self.extra_include_paths,
            extra_compile_args={
-                'cxx': ['-O3'] + self.version_dependent_macros + self.extra_cxx_flags,
-                'nvcc': ['-O3', '--use_fast_math'] + self.extra_cuda_flags
+                'cxx': ['-O3'] + self.version_dependent_macros + self.extra_cxx_flags,
+                'nvcc':
+                    append_nvcc_threads(['-O3', '--use_fast_math'] + self.version_dependent_macros +
+                                        self.extra_cuda_flags)
            })

    def load(self, verbose=True):
        """
-
        load and compile cpu_adam lib at runtime

        Args:
--- a/colossalai/kernel/op_builder/fused_optim.py
+++ b/colossalai/kernel/op_builder/fused_optim.py
@@ -7,7 +7,7 @@ from .builder import Builder


 class FusedOptimBuilder(Builder):
-    NAME = "fused_optim"
+    NAME = 'fused_optim'
    BASE_DIR = "cuda_native/csrc"

    def __init__(self):
@@ -41,10 +41,10 @@ class FusedOptimBuilder(Builder):
        cuda_include = os.path.join(CUDA_HOME, "include")
        return [os.path.join(FusedOptimBuilder.BASE_DIR, "includes"), cuda_include]

-    def builder(self):
+    def builder(self, name):
        from torch.utils.cpp_extension import CUDAExtension
        return CUDAExtension(
-            name=self.name,
+            name=name,
            sources=[os.path.join('colossalai/kernel/cuda_native/csrc', path) for path in self.sources],
            include_dirs=self.extra_include_paths,
            extra_compile_args={
--- a/colossalai/kernel/op_builder/utils.py
+++ b/colossalai/kernel/op_builder/utils.py
@@ -0,0 +1,20 @@
+import subprocess
+
+
+def get_cuda_bare_metal_version(cuda_dir):
+    """Run ``<cuda_dir>/bin/nvcc -V`` and return (raw output, major, minor); the version parts are strings."""
+    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
+    output = raw_output.split()
+    # The token after "release" is e.g. "11.2," -- strip the comma so a multi-digit minor is kept whole.
+    release = output[output.index("release") + 1].rstrip(",").split(".")
+    bare_metal_major, bare_metal_minor = release[0], release[1]
+
+    return raw_output, bare_metal_major, bare_metal_minor
+
+
+def append_nvcc_threads(nvcc_extra_args):
+    """Return ``nvcc_extra_args`` plus ``--threads 4`` when the nvcc under CUDA_HOME is release 11.2 or newer."""
+    from torch.utils.cpp_extension import CUDA_HOME
+    _, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(CUDA_HOME)
+    supports_threads = (int(bare_metal_major), int(bare_metal_minor)) >= (11, 2)    # tuple compare: 12.0 qualifies
+    return nvcc_extra_args + ["--threads", "4"] if supports_threads else nvcc_extra_args