llms: add mps support

2025-08-01 16:18:27 +00:00 · 2023-05-21 14:48:54 +08:00 · 2023-05-21 14:48:54 +08:00 · ce72820085
commit ce72820085
parent 7b454d8867
4 changed files with 181 additions and 15 deletions
--- a/pilot/configs/model_config.py
+++ b/pilot/configs/model_config.py
@ -16,7 +16,7 @@ DATA_DIR = os.path.join(PILOT_PATH, "data")

 nltk.data.path = [os.path.join(PILOT_PATH, "nltk_data")] + nltk.data.path

-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Pick the compute device in order of preference: CUDA, then MPS, then CPU.
+if torch.cuda.is_available():
+    DEVICE = "cuda"
+elif torch.backends.mps.is_available():
+    DEVICE = "mps"
+else:
+    DEVICE = "cpu"
 LLM_MODEL_CONFIG = {
    "flan-t5-base": os.path.join(MODEL_PATH, "flan-t5-base"),
    "vicuna-13b": os.path.join(MODEL_PATH, "vicuna-13b"),
--- a/pilot/model/llm/monkey_patch.py
+++ b/pilot/model/llm/monkey_patch.py
@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+import math
+from typing import Optional, Tuple
+
+import torch
+from torch import nn
+import transformers
+
+
+def rotate_half(x):
+    """Return ``x`` with the halves of its last dimension swapped and the
+    (originally) second half negated.
+
+    The last dimension is split at ``x.shape[-1] // 2``; the result is the
+    concatenation ``(-second_half, first_half)`` along that dimension.
+    """
+    split_at = x.shape[-1] // 2
+    # Both halves are cloned rather than used as views of ``x``
+    # (presumably part of the MPS workaround this module exists for).
+    front = x[..., :split_at].clone()
+    back = x[..., split_at:].clone()
+    return torch.cat((-back, front), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
+    """Apply rotary position embeddings to the query and key tensors.
+
+    ``position_ids`` (indexed as a 2-D ``[bs, seq_len]`` tensor) selects, per
+    batch row, which entries along dim 2 of ``cos`` / ``sin`` are used. The
+    selected tables are combined with ``q`` and ``k`` as
+    ``t * cos + rotate_half(t) * sin``.
+
+    Returns:
+        The pair ``(q_embed, k_embed)``.
+    """
+    batch = position_ids.shape[0]
+    # [bs, seq_len] -> [bs, 1, seq_len, 1], then expanded to match dims 1 and 3 of cos
+    index = position_ids[:, None, :, None].repeat(1, cos.shape[1], 1, cos.shape[3])
+
+    # Tile the tables over the batch, then gather along the sequence axis
+    # (out-of-place operations only).
+    cos_at_pos = torch.gather(cos.repeat(batch, 1, 1, 1), 2, index)
+    sin_at_pos = torch.gather(sin.repeat(batch, 1, 1, 1), 2, index)
+
+    rotated_q = (q * cos_at_pos) + (rotate_half(q) * sin_at_pos)
+    rotated_k = (k * cos_at_pos) + (rotate_half(k) * sin_at_pos)
+    return rotated_q, rotated_k
+
+
+def forward(
+    self,
+    hidden_states: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_value: Optional[Tuple[torch.Tensor]] = None,
+    output_attentions: bool = False,
+    use_cache: bool = False,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    """Attention forward pass installed onto ``LlamaAttention`` by
+    ``replace_llama_attn_with_non_inplace_operations``.
+
+    Args:
+        hidden_states: 3-D input unpacked as ``[bsz, q_len, hidden]``.
+        attention_mask: optional additive mask; must have shape
+            ``[bsz, 1, q_len, kv_seq_len]``.
+        position_ids: positions used to pick rotary cos/sin entries. It is
+            indexed unconditionally, so callers must pass a tensor.
+        past_key_value: optional cached ``(key, value)`` pair; the new keys
+            and values are appended to it along dim 2.
+        output_attentions: when False the returned attention weights are None.
+        use_cache: when True the (possibly extended) ``(key, value)`` pair is
+            returned for reuse, otherwise None.
+
+    Returns:
+        ``(attn_output, attn_weights_or_None, past_key_value_or_None)``.
+
+    Raises:
+        ValueError: if the attention weights, the attention mask or the
+            attention output do not have the expected shape.
+    """
+    bsz, q_len, _ = hidden_states.size()
+
+    # Project and reshape to [bsz, num_heads, q_len, head_dim].
+    query_states = (
+        self.q_proj(hidden_states)
+        .view(bsz, q_len, self.num_heads, self.head_dim)
+        .transpose(1, 2)
+    )
+    key_states = (
+        self.k_proj(hidden_states)
+        .view(bsz, q_len, self.num_heads, self.head_dim)
+        .transpose(1, 2)
+    )
+    value_states = (
+        self.v_proj(hidden_states)
+        .view(bsz, q_len, self.num_heads, self.head_dim)
+        .transpose(1, 2)
+    )
+
+    # Total key/value length = new tokens + whatever is already cached.
+    kv_seq_len = key_states.shape[-2]
+    if past_key_value is not None:
+        kv_seq_len += past_key_value[0].shape[-2]
+    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+    query_states, key_states = apply_rotary_pos_emb(
+        query_states, key_states, cos, sin, position_ids
+    )
+    # [bsz, nh, t, hd]
+
+    if past_key_value is not None:
+        # reuse k, v, self_attention
+        key_states = torch.cat([past_key_value[0], key_states], dim=2)
+        value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+    past_key_value = (key_states, value_states) if use_cache else None
+
+    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(
+        self.head_dim
+    )
+
+    if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+        # The message reports the same 4-D shape that the check compares against.
+        raise ValueError(
+            f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+            f" {attn_weights.size()}"
+        )
+
+    if attention_mask is not None:
+        if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+            raise ValueError(
+                f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+            )
+        attn_weights = attn_weights + attention_mask
+        # Clamp to the dtype's minimum so masked entries cannot overflow to
+        # -inf. The floor is created on the same device as the weights to
+        # avoid a cross-device operand.
+        attn_weights = torch.max(
+            attn_weights,
+            torch.tensor(
+                torch.finfo(attn_weights.dtype).min, device=attn_weights.device
+            ),
+        )
+
+    # upcast attention to fp32
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(
+        query_states.dtype
+    )
+    attn_output = torch.matmul(attn_weights, value_states)
+
+    if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+        raise ValueError(
+            f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+            f" {attn_output.size()}"
+        )
+
+    # [bsz, nh, q_len, hd] -> [bsz, q_len, nh, hd] -> [bsz, q_len, hidden_size]
+    attn_output = attn_output.transpose(1, 2)
+    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+    attn_output = self.o_proj(attn_output)
+
+    if not output_attentions:
+        attn_weights = None
+
+    return attn_output, attn_weights, past_key_value
+
+
+def replace_llama_attn_with_non_inplace_operations():
+    """Avoid bugs in mps backend by not using in-place operations.
+
+    Replaces ``LlamaAttention.forward`` in ``transformers`` with the
+    ``forward`` function defined in this module. The assignment is made on
+    the class, so it affects every ``LlamaAttention`` instance in the process.
+    """
+    transformers.models.llama.modeling_llama.LlamaAttention.forward = forward
--- a/pilot/model/loader.py
+++ b/pilot/model/loader.py
@ -2,11 +2,39 @@
 # -*- coding: utf-8 -*-

 import torch
+import sys
 import warnings
 from pilot.singleton import Singleton
-
+from typing import Optional
 from pilot.model.compression import compress_module 
 from pilot.model.adapter import get_llm_model_adapter
+from pilot.utils import get_gpu_memory
+from pilot.model.llm.monkey_patch import replace_llama_attn_with_non_inplace_operations
+
+def raise_warning_for_incompatible_cpu_offloading_configuration(
+    device: str, load_8bit: bool, cpu_offloading: bool
+):
+    """Check whether a cpu-offloading request can be honoured.
+
+    If ``cpu_offloading`` is falsy it is returned unchanged. Otherwise the
+    request is kept only when 8-bit loading is on, the platform string
+    contains "linux" and the device is "cuda". The first failing condition
+    emits a warning and the function returns False.
+
+    Returns:
+        The effective cpu-offloading flag.
+    """
+    if not cpu_offloading:
+        # Nothing was requested, so there is nothing to validate.
+        return cpu_offloading
+
+    if not load_8bit:
+        warnings.warn(
+            "The cpu-offloading feature can only be used while also using 8-bit-quantization.\n"
+            "Use '--load-8bit' to enable 8-bit-quantization\n"
+            "Continuing without cpu-offloading enabled\n"
+        )
+        return False
+
+    if "linux" not in sys.platform:
+        warnings.warn(
+            "CPU-offloading is only supported on linux-systems due to the limited compatability with the bitsandbytes-package\n"
+            "Continuing without cpu-offloading enabled\n"
+        )
+        return False
+
+    if device != "cuda":
+        warnings.warn(
+            "CPU-offloading is only enabled when using CUDA-devices\n"
+            "Continuing without cpu-offloading enabled\n"
+        )
+        return False
+
+    return cpu_offloading


 class ModelLoader(metaclass=Singleton):
@ -30,26 +58,39 @@ class ModelLoader(metaclass=Singleton):
        }

    # TODO multi gpu support
-    def loader(self, num_gpus, load_8bit=False, debug=False):
+    def loader(self, num_gpus, load_8bit=False, debug=False, cpu_offloading=False, max_gpu_memory: Optional[str]=None):
+        # Validate the cpu-offloading request. The helper warns and returns
+        # False when the current device / platform / quantization settings
+        # cannot support it, so the result must replace the raw flag.
+        cpu_offloading = raise_warning_for_incompatible_cpu_offloading_configuration(
+            self.device, load_8bit, cpu_offloading
+        )
+
         if self.device == "cpu":
-            kwargs = {}
+            kwargs = {"torch_dtype": torch.float32}
 
         elif self.device == "cuda":
             kwargs = {"torch_dtype": torch.float16}
             if num_gpus == "auto":
-                kwargs["device_map"] = "auto"
+                # "auto" stays accepted: resolve it to the number of visible
+                # CUDA devices so the integer logic below applies uniformly.
+                num_gpus = torch.cuda.device_count()
             else:
                 num_gpus = int(num_gpus)
-                if num_gpus != 1:
-                    kwargs.update({
-                        "device_map": "auto",
-                        "max_memory": {i: "13GiB" for i in range(num_gpus)},
-                    })
+
+            if num_gpus != 1:
+                kwargs["device_map"] = "auto"
+                if max_gpu_memory is None:
+                    # No explicit limit: fill GPUs sequentially, capped at
+                    # 85% of the memory reported for each device.
+                    kwargs["device_map"] = "sequential"
+                    available_gpu_memory = get_gpu_memory(num_gpus)
+                    kwargs["max_memory"] = {
+                        i: str(int(available_gpu_memory[i] * 0.85)) + "GiB"
+                        for i in range(num_gpus)
+                    }
+                else:
+                    # Honour the caller-supplied per-GPU limit.
+                    kwargs["max_memory"] = {i: max_gpu_memory for i in range(num_gpus)}
+
+        elif self.device == "mps":
+            kwargs = {"torch_dtype": torch.float16}
+            # Swap in the attention forward that avoids in-place operations.
+            replace_llama_attn_with_non_inplace_operations()
        else:
-            # Todo Support mps for practise
            raise ValueError(f"Invalid device: {self.device}")

-        
+        # TODO when cpu loading,  need use quantization config
+
        llm_adapter = get_llm_model_adapter(self.model_path)
        model, tokenizer = llm_adapter.loader(self.model_path, kwargs)

@ -61,7 +102,7 @@ class ModelLoader(metaclass=Singleton):
            else:
                compress_module(model, self.device) 

-        if (self.device == "cuda" and num_gpus == 1):
+        if (self.device == "cuda" and num_gpus == 1 and not cpu_offloading) or self.device == "mps":
            model.to(self.device)

        if debug:
--- a/pilot/server/llmserver.py
+++ b/pilot/server/llmserver.py
@ -153,7 +153,7 @@ def embeddings(prompt_request: EmbeddingRequest):
 if __name__ == "__main__":

    model_path = LLM_MODEL_CONFIG[CFG.LLM_MODEL]
-    print(model_path)
+    print(model_path, DEVICE)
    
    worker = ModelWorker(
        model_path=model_path,