Mirror of https://github.com/hpcaitech/ColossalAI.git
[moe] merge moe into main (#4978)
* update moe module
* support openmoe
0    examples/language/openmoe/model/__init__.py  Normal file
224  examples/language/openmoe/model/convert_openmoe_ckpt.py  Normal file
@@ -0,0 +1,224 @@
# coding=utf-8
# Copyright 2022 Google LLC and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Convert a T5X checkpoint to PyTorch

Steps:
- Install gsutil according to https://cloud.google.com/storage/docs/gsutil_install
- Get a T5X checkpoint at https://github.com/google-research/t5x/blob/main/docs/models.md#t5-11-checkpoints Example:
  `gsutil -m cp -r gs://t5-data/pretrained_models/t5x/t5_1_1_small $HOME/`
- Create or download a corresponding config for the downloaded model. E.g. for T5 v1.1 small, you can use
  https://huggingface.co/google/t5-v1_1-small/blob/main/config.json
- Convert:
  ```
  python3 convert_t5x_checkpoint_to_pytorch.py --t5x_checkpoint_path=$HOME/t5_1_1_small --config_file=config.json\
      --pytorch_dump_path=$HOME/t5_1_1_small_pt
  ```
"""

import argparse
import collections

import torch
from flax import traverse_util
from modeling_openmoe import OpenMoeForCausalLM
from t5x import checkpoints
from transformers import LlamaConfig
from transformers.utils import logging

logging.set_verbosity_info()


def t5x_attention_lookup(params, i, prefix, layer_name="attention"):
    """Returns the KOQV parameters of (self-)attention. Does not transpose."""
    k = params[f"{prefix}/layers_{i}/{layer_name}/key/kernel"]
    o = params[f"{prefix}/layers_{i}/{layer_name}/out/kernel"]
    q = params[f"{prefix}/layers_{i}/{layer_name}/query/kernel"]
    v = params[f"{prefix}/layers_{i}/{layer_name}/value/kernel"]
    return k, o, q, v


def t5x_mlp_lookup(params, i, prefix, split_mlp_wi=False):
    """Returns the MLP parameters of a layer. Does not transpose."""
    if split_mlp_wi:
        wi_0 = params[f"{prefix}/layers_{i}/mlp/wi_0/kernel"]
        wi_1 = params[f"{prefix}/layers_{i}/mlp/wi_1/kernel"]
        wi = (wi_0, wi_1)
    else:
        wi = params[f"{prefix}/layers_{i}/mlp/wi/kernel"]

    wo = params[f"{prefix}/layers_{i}/mlp/wo/kernel"]
    return wi, wo


def t5x_extra_mlp_lookup(params, i, prefix, split_mlp_wi=False):
    """Returns the extra MLP parameters of a MoE layer. Does not transpose."""
    if split_mlp_wi:
        wi_0 = params[f"{prefix}/layers_{i}/extra_mlp/wi_0/kernel"]
        wi_1 = params[f"{prefix}/layers_{i}/extra_mlp/wi_1/kernel"]
        wi = (wi_0, wi_1)
    else:
        wi = params[f"{prefix}/layers_{i}/extra_mlp/wi/kernel"]

    wo = params[f"{prefix}/layers_{i}/extra_mlp/wo/kernel"]
    return wi, wo


def t5x_experts_lookup(params, i, prefix, split_mlp_wi=False):
    """Returns the expert MLP parameters of a MoE layer. Does not transpose."""
    if split_mlp_wi:
        wi_0 = params[f"{prefix}/layers_{i}/mlp/expert/wi_0/kernel"]
        wi_1 = params[f"{prefix}/layers_{i}/mlp/expert/wi_1/kernel"]
        wi = (wi_0, wi_1)
    else:
        wi = params[f"{prefix}/layers_{i}/mlp/expert/wi/kernel"]

    wo = params[f"{prefix}/layers_{i}/mlp/expert/wo/kernel"]
    return wi, wo


def t5x_gate_lookup(params, i, prefix, split_mlp_wi=False):
    """Returns the router gate weights of a MoE layer. Does not transpose."""
    return params[f"{prefix}/layers_{i}/mlp/router/router_weights/w/kernel"]


def t5x_layer_norm_lookup(params, i, prefix, layer_name):
    """Returns the layer norm param of a layer."""
    return params[f"{prefix}/layers_{i}/{layer_name}/scale"]


def convert_t5x_to_pytorch(variables: dict, *, num_layers: int, moe_interval: int):
    """Converts the parameters from T5X-Flax to Transformers-PyTorch."""
    old = traverse_util.flatten_dict(variables["target"])
    old = {"/".join(k): v for k, v in old.items()}

    # v1.1 models have a gated GeLU with wi_0 and wi_1 instead of wi
    split_mlp_wi = True
    print("Split MLP:", split_mlp_wi)

    new = collections.OrderedDict()
    print(old.keys())
    for key, value in old.items():
        print(f"{key}: {value.shape}")

    # Shared embeddings.
    new["model.embed_tokens.weight"] = old["token_embedder/embedding"]

    # Decoder.
    for i in range(num_layers):
        # Block i, layer 0 (Self Attention).
        layer_norm = t5x_layer_norm_lookup(old, i, "decoder", "pre_self_attention_layer_norm")
        k, o, q, v = t5x_attention_lookup(old, i, "decoder", "self_attention")
        new[f"model.layers.{i}.input_layernorm.weight"] = layer_norm
        new[f"model.layers.{i}.self_attn.k_proj.weight"] = k.T
        new[f"model.layers.{i}.self_attn.o_proj.weight"] = o.T
        new[f"model.layers.{i}.self_attn.q_proj.weight"] = q.T
        new[f"model.layers.{i}.self_attn.v_proj.weight"] = v.T

        # Block i, layer 2 (MLP).
        layer_norm = t5x_layer_norm_lookup(old, i, "decoder", "pre_mlp_layer_norm")
        new[f"model.layers.{i}.post_attention_layernorm.weight"] = layer_norm

        if (i + 1) % moe_interval == 0:
            # moe
            gate = t5x_gate_lookup(old, i, "decoder", split_mlp_wi)
            new[f"model.layers.{i}.mlp.gate_weight"] = gate.T
            wi, wo = t5x_experts_lookup(old, i, "decoder", split_mlp_wi)
            new[f"model.layers.{i}.mlp.experts.wi_gate"] = wi[0]
            new[f"model.layers.{i}.mlp.experts.wi_up"] = wi[1]
            new[f"model.layers.{i}.mlp.experts.wo"] = wo
            # extra
            layer_norm = t5x_layer_norm_lookup(old, i, "decoder", "pre_extra_mlp_layer_norm")
            new[f"model.layers.{i}.pre_extra_mlp_layernorm.weight"] = layer_norm
            wi, wo = t5x_extra_mlp_lookup(old, i, "decoder", split_mlp_wi)
            new[f"model.layers.{i}.extra_mlp.gate_proj.weight"] = wi[0].T
            new[f"model.layers.{i}.extra_mlp.up_proj.weight"] = wi[1].T
            new[f"model.layers.{i}.extra_mlp.down_proj.weight"] = wo.T
        else:
            wi, wo = t5x_mlp_lookup(old, i, "decoder", split_mlp_wi)
            new[f"model.layers.{i}.mlp.gate_proj.weight"] = wi[0].T
            new[f"model.layers.{i}.mlp.up_proj.weight"] = wi[1].T
            new[f"model.layers.{i}.mlp.down_proj.weight"] = wo.T

    new["model.norm.weight"] = old["decoder/decoder_norm/scale"]

    # LM Head (only in v1.1 checkpoints, in v1.0 embeddings are used instead)
    if "decoder/logits_dense/kernel" in old:
        new["lm_head.weight"] = old["decoder/logits_dense/kernel"].T

    return new


def make_state_dict(converted_params):
    """Prepares a state dict for the PyTorch model."""
    # Make a state dict with torch tensors.
    state_dict = collections.OrderedDict([(k, torch.from_numpy(v.copy())) for (k, v) in converted_params.items()])

    return state_dict


def load_t5x_weights_in_t5(model, config, t5x_checkpoint_path):
    """Replaces the params in model with the T5X converted params."""
    variables = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path)
    converted = convert_t5x_to_pytorch(variables,
                                       num_layers=config.num_hidden_layers,
                                       moe_interval=config.moe_layer_interval)
    state_dict = make_state_dict(converted)
    model.load_state_dict(state_dict, strict=True)


def convert_t5x_checkpoint_to_pytorch(t5x_checkpoint_path, config_file, pytorch_dump_path):
    """Loads the config and model, converts the T5X checkpoint, and saves a PyTorch checkpoint."""
    # Initialise PyTorch model
    config = LlamaConfig.from_json_file(config_file)
    print(f"Building PyTorch model from configuration: {config}")
    # Non-v1.1 checkpoints could also use T5Model, but this works for all.
    # The v1.0 checkpoints will simply have an LM head that is the word embeddings.
    model = OpenMoeForCausalLM(config)

    # Load weights from the T5X checkpoint
    load_t5x_weights_in_t5(model, config, t5x_checkpoint_path)

    # Save the PyTorch model
    print(f"Save PyTorch model to {pytorch_dump_path}")
    model.save_pretrained(pytorch_dump_path)

    # Verify that we can load the checkpoint.
    model.from_pretrained(pytorch_dump_path)
    print("Done")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Converts a native T5X checkpoint into a PyTorch checkpoint.")
    # Required parameters
    parser.add_argument("--t5x_checkpoint_path",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to the T5X checkpoint.")
    parser.add_argument(
        "--config_file",
        default=None,
        type=str,
        required=True,
        help="The config json file corresponding to the pre-trained T5 model.\nThis specifies the model architecture.",
    )
    parser.add_argument("--pytorch_dump_path",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to the output PyTorch model.")
    args = parser.parse_args()
    convert_t5x_checkpoint_to_pytorch(args.t5x_checkpoint_path, args.config_file, args.pytorch_dump_path)
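A note on the `.T` calls above: a Flax/T5X dense kernel is stored as `(in_features, out_features)`, while `torch.nn.Linear.weight` is `(out_features, in_features)`, so attention and dense-MLP kernels are transposed during conversion (the expert `wi`/`wo` tensors are copied as stored). A minimal, self-contained sketch of that layout convention, using hypothetical sizes and plain NumPy/PyTorch rather than a real checkpoint:

```python
import numpy as np
import torch

# Hypothetical sizes, only to illustrate the layout convention.
in_features, out_features = 8, 4

# Flax-style dense kernel: y = x @ kernel, shape (in_features, out_features).
kernel = np.random.randn(in_features, out_features).astype(np.float32)

# torch.nn.Linear computes y = x @ weight.T, so the converted weight is the transposed kernel.
linear = torch.nn.Linear(in_features, out_features, bias=False)
with torch.no_grad():
    linear.weight.copy_(torch.from_numpy(kernel.copy()).T)

x = np.random.randn(2, in_features).astype(np.float32)
print(np.allclose(x @ kernel, linear(torch.from_numpy(x)).detach().numpy(), atol=1e-5))  # True
```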
1  examples/language/openmoe/model/convert_openmoe_ckpt.sh  Normal file
@@ -0,0 +1 @@
python convert_openmoe_ckpt.py --t5x_checkpoint_path /path/to/t5x --config_file /path/to/config --pytorch_dump_path /path/to/save
1113  examples/language/openmoe/model/modeling_openmoe.py  Normal file
(File diff suppressed because it is too large.)
24  examples/language/openmoe/model/openmoe_8b_config.json  Normal file
@@ -0,0 +1,24 @@
{
    "architectures": [
        "OpenMoeForCausalLM"
    ],
    "intermediate_size": 8192,
    "hidden_size": 2048,
    "num_hidden_layers": 24,
    "head_dim": 128,
    "num_attention_heads": 24,
    "dropout_rate": 0.0,
    "layer_norm_epsilon": 1e-06,
    "vocab_size": 256384,
    "hidden_act": "swiglu",
    "num_experts": 32,
    "topk": 2,
    "capacity_factor_train": 1.25,
    "capacity_factor_eval": 2.0,
    "min_capacity": 4,
    "noisy_policy": null,
    "drop_tks": true,
    "expert_parallel": null,
    "gated": true,
    "moe_layer_interval": 6
}
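The converter reads this file through `LlamaConfig.from_json_file` and uses `moe_layer_interval` to decide which decoder layers carry experts: every layer `i` with `(i + 1) % moe_layer_interval == 0`. A small sketch of that rule, assuming the JSON above has been saved locally as `openmoe_8b_config.json` (the filename is only an assumption for the example):

```python
import json

# Assumes the config above was saved next to this snippet.
with open("openmoe_8b_config.json") as f:
    cfg = json.load(f)

# Same rule as convert_t5x_to_pytorch: layer i is a MoE layer iff (i + 1) % interval == 0.
moe_layers = [i for i in range(cfg["num_hidden_layers"])
              if (i + 1) % cfg["moe_layer_interval"] == 0]
print(moe_layers)  # [5, 11, 17, 23] for 24 layers with interval 6
```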
24  examples/language/openmoe/model/openmoe_base_config.json  Normal file
@@ -0,0 +1,24 @@
{
    "architectures": [
        "OpenMoeForCausalLM"
    ],
    "intermediate_size": 2048,
    "hidden_size": 768,
    "num_hidden_layers": 12,
    "head_dim": 64,
    "num_attention_heads": 12,
    "dropout_rate": 0.0,
    "layer_norm_epsilon": 1e-06,
    "vocab_size": 256384,
    "hidden_act": "swiglu",
    "num_experts": 16,
    "topk": 2,
    "capacity_factor_train": 1.25,
    "capacity_factor_eval": 2.0,
    "min_capacity": 4,
    "noisy_policy": null,
    "drop_tks": true,
    "expert_parallel": null,
    "gated": true,
    "moe_layer_interval": 4
}
562  examples/language/openmoe/model/openmoe_policy.py  Normal file
@@ -0,0 +1,562 @@
import warnings
from functools import partial
from typing import Callable, Dict, List, Optional, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch.nn import Module
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.utils import logging

from colossalai.moe.manager import MOE_MANAGER
from colossalai.pipeline.stage_manager import PipelineStageManager
from colossalai.shardformer.layer import FusedRMSNorm, Linear1D_Col
from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription

from .modeling_openmoe import OpenMoeDecoderLayer, OpenMoeForCausalLM, OpenMoeModel

__all__ = ["OpenMoePolicy", "OpenMoeForCausalLMPolicy"]


class OpenMoePolicy(Policy):

    def config_sanity_check(self):
        pass

    def preprocess(self):
        if self.shard_config.enable_tensor_parallelism:
            # Resize embedding
            vocab_size = self.model.config.vocab_size
            world_size = self.shard_config.tensor_parallel_size

            if vocab_size % world_size != 0:
                new_vocab_size = vocab_size + world_size - vocab_size % world_size
                self.model.resize_token_embeddings(new_vocab_size)

        return self.model

    def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
        policy = {}

        if self.shard_config.enable_sequence_parallelism:
            self.shard_config.enable_sequence_parallelism = False
            raise NotImplementedError(
                "openmoe doesn't support sequence parallelism now, will ignore the sequence parallelism flag.")

        if self.shard_config.enable_tensor_parallelism:
            raise NotImplementedError("Tensor parallelism is not supported for openmoe model now.")

        # optimization configuration
        if self.shard_config.enable_fused_normalization:
            self.append_or_create_submodule_replacement(
                description=[
                    SubModuleReplacementDescription(
                        suffix="input_layernorm",
                        target_module=FusedRMSNorm,
                    ),
                    SubModuleReplacementDescription(
                        suffix="post_attention_layernorm",
                        target_module=FusedRMSNorm,
                    ),
                    SubModuleReplacementDescription(
                        suffix="pre_extra_mlp_layernorm",
                        target_module=FusedRMSNorm,
                        ignore_if_not_exist=True,
                    ),
                ],
                policy=policy,
                target_key=OpenMoeDecoderLayer,
            )

            self.append_or_create_submodule_replacement(
                description=SubModuleReplacementDescription(
                    suffix="norm",
                    target_module=FusedRMSNorm,
                ),
                policy=policy,
                target_key=OpenMoeModel,
            )

        if self.shard_config.enable_flash_attention:
            raise NotImplementedError("Flash attention has already been replaced in openmoe.")

        return policy

    def postprocess(self):
        return self.model

    def set_pipeline_forward(self, model_cls: nn.Module, new_forward: Callable, policy: Dict) -> None:
        """If under the pipeline parallel setting, replace the original forward method of the
        huggingface model with the customized forward method, and add this change to the policy."""
        if self.pipeline_stage_manager:
            stage_manager = self.pipeline_stage_manager
            if self.model.__class__.__name__ == "OpenMoeModel":
                module = self.model
            else:
                module = self.model.model

            layers_per_stage = self.distribute_layers(len(module.layers), stage_manager.num_stages)
            stage_index = Policy.get_stage_index(layers_per_stage, stage_manager.stage)
            method_replacement = {"forward": partial(new_forward, stage_manager=stage_manager, stage_index=stage_index)}
            self.append_or_create_method_replacement(description=method_replacement,
                                                     policy=policy,
                                                     target_key=model_cls)

        return

    def get_held_layers(self) -> List[Module]:
        """Get pipeline layers for current stage."""
        assert self.pipeline_stage_manager is not None

        if self.model.__class__.__name__ == "OpenMoeModel":
            module = self.model
        else:
            module = self.model.model
        stage_manager = self.pipeline_stage_manager

        held_layers = []
        layers_per_stage = self.distribute_layers(len(module.layers), stage_manager.num_stages)
        if stage_manager.is_first_stage():
            held_layers.append(module.embed_tokens)
        start_idx, end_idx = self.get_stage_index(layers_per_stage, stage_manager.stage)
        held_layers.extend(module.layers[start_idx:end_idx])
        if stage_manager.is_last_stage():
            held_layers.append(module.norm)

        return held_layers

    @staticmethod
    def distribute_layers(num_layers: int, num_stages: int) -> List[int]:
        """Divide layers into stages."""
        if num_layers == 24 and num_stages == 4:
            return [7, 7, 7, 3]
        elif num_layers == 24 and num_stages == 2:
            return [15, 9]
        elif num_layers == 12 and num_stages == 4:
            return [5, 5, 5, 1]
        elif num_layers == 12 and num_stages == 2:
            return [8, 4]
        else:
            print(f"num_layers: {num_layers}, num_stages: {num_stages} not optimized, use origin pp policy")
            return Policy.distribute_layers(num_layers, num_stages)


class OpenMoeModelPolicy(OpenMoePolicy):

    def __init__(self) -> None:
        super().__init__()

    def module_policy(self):
        policy = super().module_policy()
        if self.pipeline_stage_manager:
            # set None as default
            self.set_pipeline_forward(
                model_cls=OpenMoeModel,
                new_forward=OpenMoePipelineForwards.openmoe_model_forward,
                policy=policy,
            )
        return policy

    def get_held_layers(self) -> List[Module]:
        """Get pipeline layers for current stage."""
        held_layers = super().get_held_layers()
        return held_layers

    def get_shared_params(self) -> List[Dict[int, Tensor]]:
        """No shared params in the OpenMoe model."""
        return []


class OpenMoeForCausalLMPolicy(OpenMoePolicy):

    def module_policy(self):
        policy = super().module_policy()

        if self.shard_config.enable_tensor_parallelism:
            # add a new item for causal lm
            new_item = {
                OpenMoeForCausalLM:
                    ModulePolicyDescription(sub_module_replacement=[
                        SubModuleReplacementDescription(
                            suffix="lm_head",
                            target_module=Linear1D_Col,
                            kwargs=dict(gather_output=True),
                        )
                    ])
            }
            policy.update(new_item)

        if self.pipeline_stage_manager:
            # set None as default
            self.set_pipeline_forward(
                model_cls=OpenMoeForCausalLM,
                new_forward=OpenMoePipelineForwards.llama_for_causal_lm_forward,
                policy=policy,
            )

        return policy

    def get_held_layers(self) -> List[Module]:
        """Get pipeline layers for current stage."""
        stage_manager = self.pipeline_stage_manager
        held_layers = super().get_held_layers()
        if stage_manager.is_last_stage():
            held_layers.append(self.model.lm_head)
        return held_layers

    def get_shared_params(self) -> List[Dict[int, Tensor]]:
        llama_model = self.model.model
        if self.pipeline_stage_manager and self.pipeline_stage_manager.num_stages > 1:
            if (id(llama_model.embed_tokens.weight) == id(self.model.lm_head.weight)
                    and self.pipeline_stage_manager.num_stages > 1):
                # tie weights
                return [{
                    0: llama_model.embed_tokens.weight,
                    self.pipeline_stage_manager.num_stages - 1: self.model.lm_head.weight,
                }]
        return []


class OpenMoePipelineForwards:
    """
    This class serves as a micro library of forward-function substitutions for OpenMoe models
    under the pipeline setting.
    """

    @staticmethod
    def openmoe_model_forward(
        self: OpenMoeModel,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        stage_manager: Optional[PipelineStageManager] = None,
        hidden_states: Optional[torch.FloatTensor] = None,
        stage_index: Optional[List[int]] = None,
        past_router_aux_loss: Optional[torch.FloatTensor] = None,
        past_router_z_loss: Optional[torch.FloatTensor] = None,
    ):
        # reset moe loss for different data
        MOE_MANAGER.reset_loss()

        logger = logging.get_logger(__name__)

        output_attentions = (output_attentions if output_attentions is not None else self.config.output_attentions)
        output_hidden_states = (output_hidden_states
                                if output_hidden_states is not None else self.config.output_hidden_states)
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = (return_dict if return_dict is not None else self.config.use_return_dict)

        # retrieve input_ids and inputs_embeds
        if stage_manager.is_first_stage():
            if input_ids is not None and inputs_embeds is not None:
                raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
            elif input_ids is not None:
                batch_size, seq_length = input_ids.shape
            elif inputs_embeds is not None:
                batch_size, seq_length, _ = inputs_embeds.shape
            else:
                raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            if inputs_embeds is None:
                inputs_embeds = self.embed_tokens(input_ids)
            hidden_states = inputs_embeds
        else:
            input_shape = hidden_states.shape[:-1]
            batch_size, seq_length = input_shape
            device = hidden_states.device

        seq_length_with_past = seq_length
        past_key_values_length = 0

        # TODO(jianghai): left the recording kv-value tensors as () or None type, this feature may be added in the future.
        if output_attentions:
            logger.warning_once("output_attentions=True is not supported for pipeline models at the moment.")
            output_attentions = False
        if output_hidden_states:
            logger.warning_once("output_hidden_states=True is not supported for pipeline models at the moment.")
            output_hidden_states = False
        if use_cache:
            logger.warning_once("use_cache=True is not supported for pipeline models at the moment.")
            use_cache = False

        if past_key_values is not None:
            past_key_values_length = past_key_values[0][0].shape[2]
            seq_length_with_past = seq_length_with_past + past_key_values_length

        if position_ids is None:
            position_ids = torch.arange(
                past_key_values_length,
                seq_length + past_key_values_length,
                dtype=torch.long,
                device=device,
            )
            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
        else:
            position_ids = position_ids.view(-1, seq_length).long()

        # embed positions, for the first stage, hidden_states is the input embeddings,
        # for the other stages, hidden_states is the output of the previous stage
        if attention_mask is None:
            attention_mask = torch.ones(
                (batch_size, seq_length_with_past),
                dtype=torch.bool,
                device=hidden_states.device,
            )
        attention_mask = self._prepare_decoder_attention_mask(
            attention_mask,
            (batch_size, seq_length),
            hidden_states,
            past_key_values_length,
        )

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...")
                use_cache = False

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = () if use_cache else None

        start_idx, end_idx = stage_index[0], stage_index[1]
        for idx, decoder_layer in enumerate(self.layers[start_idx:end_idx], start=start_idx):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            past_key_value = (past_key_values[idx] if past_key_values is not None else None)

            if self.gradient_checkpointing and self.training:

                def create_custom_forward(module):

                    def custom_forward(*inputs):
                        # None for past_key_value
                        return module(*inputs, output_attentions, None)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(decoder_layer),
                    hidden_states,
                    attention_mask,
                    position_ids,
                    None,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        if stage_manager.is_last_stage():
            hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)
        next_cache = next_decoder_cache if use_cache else None

        # concat past losses with current ones
        router_aux_loss, router_z_loss = MOE_MANAGER.get_loss()
        if past_router_aux_loss is not None and past_router_z_loss is not None:
            router_aux_loss = past_router_aux_loss + router_aux_loss
            router_z_loss = past_router_z_loss + router_z_loss

        if stage_manager.is_last_stage():
            return tuple([
                hidden_states,
                next_cache,
                all_hidden_states,
                all_self_attns,
                router_aux_loss,
                router_z_loss,
            ])
        # always return a dict for intermediate stages
        return {
            "hidden_states": hidden_states,
            "router_aux_loss": router_aux_loss,
            "router_z_loss": router_z_loss,
        }

    @staticmethod
    def llama_for_causal_lm_forward(
        self: OpenMoeForCausalLM,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        stage_manager: Optional[PipelineStageManager] = None,
        hidden_states: Optional[torch.FloatTensor] = None,
        stage_index: Optional[List[int]] = None,
        chunk_head: Optional[bool] = True,
        past_router_aux_loss: Optional[torch.FloatTensor] = None,
        past_router_z_loss: Optional[torch.FloatTensor] = None,
    ):
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LlamaForCausalLM

        >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        logger = logging.get_logger(__name__)
        output_attentions = (output_attentions if output_attentions is not None else self.config.output_attentions)
        output_hidden_states = (output_hidden_states
                                if output_hidden_states is not None else self.config.output_hidden_states)
        return_dict = (return_dict if return_dict is not None else self.config.use_return_dict)

        # TODO(jianghai): left the recording kv-value tensors as () or None type, this feature may be added in the future.
        if output_attentions:
            logger.warning_once("output_attentions=True is not supported for pipeline models at the moment.")
            output_attentions = False
        if output_hidden_states:
            logger.warning_once("output_hidden_states=True is not supported for pipeline models at the moment.")
            output_hidden_states = False

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = OpenMoePipelineForwards.openmoe_model_forward(
            self.model,
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            stage_manager=stage_manager,
            hidden_states=hidden_states,
            stage_index=stage_index,
            past_router_aux_loss=past_router_aux_loss,
            past_router_z_loss=past_router_z_loss,
        )

        if stage_manager.is_last_stage():
            (
                hidden_states,
                past_key_values,
                all_hidden_states,
                attentions,
                router_aux_loss,
                router_z_loss,
            ) = outputs

            if self.pretraining_tp > 1:
                lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.pretraining_tp, dim=0)
                logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.pretraining_tp)]
                logits = torch.cat(logits, dim=-1)

            loss = None
            # if no training, just do forward
            if labels is None:
                logits = self.lm_head(hidden_states)
                logits = logits.float()
            # the vocab size for openmoe is 300k+,
            # which causes great activation memory in training, up to 20G for one sequence,
            # so we use chunking and checkpointing to reduce memory
            else:
                if chunk_head:

                    def create_custom_forward(module):

                        def custom_forward(*inputs):
                            logits = module(inputs[0])
                            logits = logits.float()
                            # Shift so that tokens < n predict n
                            shift_logits = logits[..., :-1, :].contiguous().float()
                            shift_labels = inputs[1][..., 1:].contiguous()
                            # Flatten the tokens
                            loss = self._calculate_loss(shift_logits, shift_labels)
                            return loss

                        return custom_forward

                    aux_loss, z_loss = self._calculate_router_loss(router_aux_loss, router_z_loss)
                    loss = aux_loss + z_loss
                    for batch_idx in range(hidden_states.shape[0]):
                        loss = loss + torch.utils.checkpoint.checkpoint(
                            create_custom_forward(self.lm_head),
                            hidden_states[batch_idx:batch_idx + 1, :],
                            labels[batch_idx:batch_idx + 1, :],
                        )
                    logits = None
                else:
                    logits = self.lm_head(hidden_states)
                    logits = logits.float()
                    # Shift so that tokens < n predict n
                    shift_logits = logits[..., :-1, :].contiguous()
                    shift_labels = labels[..., 1:].contiguous()
                    # Flatten the tokens
                    aux_loss, z_loss = self._calculate_router_loss(router_aux_loss, router_z_loss)
                    loss = aux_loss + z_loss
                    loss = loss + self._calculate_loss(shift_logits, shift_labels)

            if not return_dict:
                output = (logits,) + outputs[1:]
                return (loss,) + output if loss is not None else output

            return CausalLMOutputWithPast(
                loss=loss,
                logits=logits,
                past_key_values=past_key_values,
                hidden_states=all_hidden_states,
                attentions=attentions,
            )
        else:
            hidden_states = outputs["hidden_states"]
            router_aux_loss = outputs["router_aux_loss"]
            router_z_loss = outputs["router_z_loss"]
            return {
                "hidden_states": hidden_states,
                "past_router_aux_loss": router_aux_loss,
                "past_router_z_loss": router_z_loss,
            }
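The `chunk_head` branch above avoids materializing `(batch, seq, vocab)` logits for the whole batch at once: the LM head and loss are recomputed per sample under gradient checkpointing, which matters with OpenMoe's ~256k vocabulary. Below is a self-contained sketch of the same idea using a plain `nn.Linear` head and cross entropy; the sizes are hypothetical and it does not use the actual OpenMoe modules or `_calculate_loss`.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.checkpoint import checkpoint

# Hypothetical sizes, kept small for illustration; OpenMoe's real vocabulary is ~256k.
hidden_size, vocab_size, batch, seq = 64, 1000, 4, 16

lm_head = nn.Linear(hidden_size, vocab_size, bias=False)
hidden_states = torch.randn(batch, seq, hidden_size, requires_grad=True)
labels = torch.randint(0, vocab_size, (batch, seq))


def head_loss(h, y):
    # Recomputed during backward, so the (1, seq, vocab) logits for each sample
    # never have to be kept alive for the whole batch at once.
    logits = lm_head(h).float()
    shift_logits = logits[..., :-1, :].contiguous().view(-1, vocab_size)
    shift_labels = y[..., 1:].contiguous().view(-1)
    return F.cross_entropy(shift_logits, shift_labels)


loss = hidden_states.new_zeros(())
for i in range(batch):
    # Checkpoint one sample at a time, mirroring the per-batch-row loop above.
    loss = loss + checkpoint(head_loss, hidden_states[i:i + 1], labels[i:i + 1])
loss = loss / batch
loss.backward()
print(loss.item())
```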