[moe] merge moe into main (#4978)

* update moe module
* support openmoe
This commit is contained in:
Xuanlei Zhao
2023-11-02 10:21:24 +08:00
committed by GitHub
parent 8993c8a817
commit dc003c304c
67 changed files with 7618 additions and 1657 deletions

View File

@@ -0,0 +1,224 @@
# coding=utf-8
# Copyright 2022 Google LLC and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Convert T5X checkpoint to PyTorch
Steps:
- Install gsutil according to https://cloud.google.com/storage/docs/gsutil_install
- Get a T5X checkpoint at https://github.com/google-research/t5x/blob/main/docs/models.md#t5-11-checkpoints Example:
`gsutil -m cp -r gs://t5-data/pretrained_models/t5x/t5_1_1_small $HOME/`
- Create or download a corresponding config for the downloaded model. E.g. for T5 v1.1 small, you can use
https://huggingface.co/google/t5-v1_1-small/blob/main/config.json
- Convert:
```
python3 convert_t5x_checkpoint_to_pytorch.py --t5x_checkpoint_path=$HOME/t5_1_1_small --config_file=config.json\
--pytorch_dump_path=$HOME/t5_1_1_small_pt
```
"""
import argparse
import collections
import torch
from flax import traverse_util
from modeling_openmoe import OpenMoeForCausalLM
from t5x import checkpoints
from transformers import LlamaConfig
from transformers.utils import logging
logging.set_verbosity_info()
def t5x_attention_lookup(params, i, prefix, layer_name="attention"):
    """Fetch the attention projection kernels of layer ``i``.

    Args:
        params: Mapping from flattened ``/``-joined parameter names to values.
        i: Index of the layer inside ``prefix``.
        prefix: Leading path component of the parameter names (e.g. ``"decoder"``).
        layer_name: Name of the attention sub-module inside the layer.

    Returns:
        A tuple ``(key, out, query, value)`` of the stored kernels, exactly as
        found in ``params`` (no transposition is applied).

    Raises:
        KeyError: If one of the four kernels is missing from ``params``.
    """
    base = f"{prefix}/layers_{i}/{layer_name}"
    # The order below (K, O, Q, V) is the order callers unpack.
    return tuple(params[f"{base}/{proj}/kernel"] for proj in ("key", "out", "query", "value"))
def t5x_mlp_lookup(params, i, prefix, split_mlp_wi=False):
    """Fetch the dense MLP kernels of layer ``i``.

    Args:
        params: Mapping from flattened ``/``-joined parameter names to values.
        i: Index of the layer inside ``prefix``.
        prefix: Leading path component of the parameter names.
        split_mlp_wi: When truthy, the input projection is stored as two
            kernels (``wi_0`` and ``wi_1``) instead of a single ``wi``.

    Returns:
        ``(wi, wo)`` where ``wi`` is either the single input kernel or the
        tuple ``(wi_0, wi_1)``. Nothing is transposed.

    Raises:
        KeyError: If an expected kernel is missing from ``params``.
    """
    base = f"{prefix}/layers_{i}/mlp"
    if not split_mlp_wi:
        wi = params[f"{base}/wi/kernel"]
    else:
        wi = tuple(params[f"{base}/{name}/kernel"] for name in ("wi_0", "wi_1"))
    return wi, params[f"{base}/wo/kernel"]
def t5x_extra_mlp_lookup(params, i, prefix, split_mlp_wi=False):
    """Fetch the ``extra_mlp`` kernels of layer ``i``.

    Args:
        params: Mapping from flattened ``/``-joined parameter names to values.
        i: Index of the layer inside ``prefix``.
        prefix: Leading path component of the parameter names.
        split_mlp_wi: When truthy, the input projection is stored as two
            kernels (``wi_0`` and ``wi_1``) instead of a single ``wi``.

    Returns:
        ``(wi, wo)`` where ``wi`` is either the single input kernel or the
        tuple ``(wi_0, wi_1)``. Nothing is transposed.

    Raises:
        KeyError: If an expected kernel is missing from ``params``.
    """
    base = f"{prefix}/layers_{i}/extra_mlp"
    if not split_mlp_wi:
        wi = params[f"{base}/wi/kernel"]
    else:
        wi = tuple(params[f"{base}/{name}/kernel"] for name in ("wi_0", "wi_1"))
    return wi, params[f"{base}/wo/kernel"]
def t5x_experts_lookup(params, i, prefix, split_mlp_wi=False):
    """Fetch the expert MLP kernels (``mlp/expert``) of layer ``i``.

    Args:
        params: Mapping from flattened ``/``-joined parameter names to values.
        i: Index of the layer inside ``prefix``.
        prefix: Leading path component of the parameter names.
        split_mlp_wi: When truthy, the input projection is stored as two
            kernels (``wi_0`` and ``wi_1``) instead of a single ``wi``.

    Returns:
        ``(wi, wo)`` where ``wi`` is either the single input kernel or the
        tuple ``(wi_0, wi_1)``. Nothing is transposed.

    Raises:
        KeyError: If an expected kernel is missing from ``params``.
    """
    base = f"{prefix}/layers_{i}/mlp/expert"
    if not split_mlp_wi:
        wi = params[f"{base}/wi/kernel"]
    else:
        wi = tuple(params[f"{base}/{name}/kernel"] for name in ("wi_0", "wi_1"))
    return wi, params[f"{base}/wo/kernel"]
def t5x_gate_lookup(params, i, prefix, split_mlp_wi=False):
    """Fetch the router weight kernel of layer ``i``. Does not transpose.

    Args:
        params: Mapping from flattened ``/``-joined parameter names to values.
        i: Index of the layer inside ``prefix``.
        prefix: Leading path component of the parameter names.
        split_mlp_wi: Unused; accepted so the signature matches the other
            lookup helpers.

    Returns:
        The value stored under ``mlp/router/router_weights/w/kernel``.

    Raises:
        KeyError: If the router kernel is missing from ``params``.
    """
    router_key = f"{prefix}/layers_{i}/mlp/router/router_weights/w/kernel"
    return params[router_key]
def t5x_layer_norm_lookup(params, i, prefix, layer_name):
    """Fetch the ``scale`` parameter of a layer norm in layer ``i``.

    Args:
        params: Mapping from flattened ``/``-joined parameter names to values.
        i: Index of the layer inside ``prefix``.
        prefix: Leading path component of the parameter names.
        layer_name: Name of the layer-norm sub-module inside the layer.

    Returns:
        The value stored under ``{prefix}/layers_{i}/{layer_name}/scale``.

    Raises:
        KeyError: If the scale is missing from ``params``.
    """
    scale_key = f"{prefix}/layers_{i}/{layer_name}/scale"
    return params[scale_key]
def convert_t5x_to_pytorch(variables: dict, *, num_layers: int, moe_interval: int):
    """Map T5X-Flax decoder parameters onto the PyTorch parameter names.

    Args:
        variables: Loaded checkpoint; its ``"target"`` entry is the nested
            parameter tree that gets flattened into ``/``-joined names.
        num_layers: Number of decoder layers to convert.
        moe_interval: Layer ``i`` is converted as an MoE layer when
            ``(i + 1) % moe_interval == 0``; all other layers use the dense MLP.

    Returns:
        An ``OrderedDict`` from PyTorch parameter name to array. Attention,
        dense-MLP, extra-MLP, router and LM-head kernels are stored transposed
        (``.T``); expert kernels, layer-norm scales and the embedding are stored
        as found.

    Raises:
        KeyError: If an expected parameter is missing from the checkpoint.
    """
    flat = {"/".join(path): leaf for path, leaf in traverse_util.flatten_dict(variables["target"]).items()}

    # The gated layout (separate wi_0 / wi_1 kernels) is always assumed.
    split_mlp_wi = True
    print("Split MLP:", split_mlp_wi)

    converted = collections.OrderedDict()

    # Dump every checkpoint entry with its shape to help debugging mismatches.
    print(flat.keys())
    for name, array in flat.items():
        print(f"{name}: {array.shape}")

    # Token embedding.
    converted["model.embed_tokens.weight"] = flat["token_embedder/embedding"]

    # Decoder layers.
    for i in range(num_layers):
        layer = f"model.layers.{i}"

        # Self-attention and its pre-norm.
        attn_norm = t5x_layer_norm_lookup(flat, i, "decoder", "pre_self_attention_layer_norm")
        k, o, q, v = t5x_attention_lookup(flat, i, "decoder", "self_attention")
        converted[f"{layer}.input_layernorm.weight"] = attn_norm
        for proj, kernel in (("k_proj", k), ("o_proj", o), ("q_proj", q), ("v_proj", v)):
            converted[f"{layer}.self_attn.{proj}.weight"] = kernel.T

        # Pre-norm of the feed-forward part.
        mlp_norm = t5x_layer_norm_lookup(flat, i, "decoder", "pre_mlp_layer_norm")
        converted[f"{layer}.post_attention_layernorm.weight"] = mlp_norm

        if (i + 1) % moe_interval != 0:
            # Dense layer: plain gated MLP.
            wi, wo = t5x_mlp_lookup(flat, i, "decoder", split_mlp_wi)
            converted[f"{layer}.mlp.gate_proj.weight"] = wi[0].T
            converted[f"{layer}.mlp.up_proj.weight"] = wi[1].T
            converted[f"{layer}.mlp.down_proj.weight"] = wo.T
            continue

        # MoE layer: router + experts (expert kernels are not transposed).
        router = t5x_gate_lookup(flat, i, "decoder", split_mlp_wi)
        converted[f"{layer}.mlp.gate_weight"] = router.T
        expert_wi, expert_wo = t5x_experts_lookup(flat, i, "decoder", split_mlp_wi)
        converted[f"{layer}.mlp.experts.wi_gate"] = expert_wi[0]
        converted[f"{layer}.mlp.experts.wi_up"] = expert_wi[1]
        converted[f"{layer}.mlp.experts.wo"] = expert_wo

        # MoE layer: additional dense MLP with its own pre-norm.
        extra_norm = t5x_layer_norm_lookup(flat, i, "decoder", "pre_extra_mlp_layer_norm")
        converted[f"{layer}.pre_extra_mlp_layernorm.weight"] = extra_norm
        extra_wi, extra_wo = t5x_extra_mlp_lookup(flat, i, "decoder", split_mlp_wi)
        converted[f"{layer}.extra_mlp.gate_proj.weight"] = extra_wi[0].T
        converted[f"{layer}.extra_mlp.up_proj.weight"] = extra_wi[1].T
        converted[f"{layer}.extra_mlp.down_proj.weight"] = extra_wo.T

    # Final decoder norm.
    converted["model.norm.weight"] = flat["decoder/decoder_norm/scale"]

    # The LM head is only converted when the checkpoint carries a separate one.
    lm_head_key = "decoder/logits_dense/kernel"
    if lm_head_key in flat:
        converted["lm_head.weight"] = flat[lm_head_key].T

    return converted
def make_state_dict(converted_params):
    """Turn a mapping of arrays into an ordered state dict of torch tensors.

    Args:
        converted_params: Mapping from parameter name to an array accepted by
            ``torch.from_numpy`` after ``.copy()``.

    Returns:
        An ``OrderedDict`` with the same keys, in the same order, whose values
        are tensors built from a copy of each array (so the tensors do not
        share memory with the inputs).
    """
    state_dict = collections.OrderedDict()
    for name, array in converted_params.items():
        # Copy first: transposed views are not accepted as-is and the copy
        # also decouples the tensor from the source buffer.
        state_dict[name] = torch.from_numpy(array.copy())
    return state_dict
def load_t5x_weights_in_t5(model, config, t5x_checkpoint_path):
    """Replace the parameters of ``model`` with the converted T5X parameters.

    Args:
        model: Model whose ``load_state_dict`` receives the converted weights.
        config: Provides ``num_hidden_layers`` and ``moe_layer_interval``.
        t5x_checkpoint_path: Location passed to ``checkpoints.load_t5x_checkpoint``.

    The state dict is loaded with ``strict=True``, so any missing or unexpected
    key makes ``load_state_dict`` fail.
    """
    t5x_variables = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path)
    torch_params = convert_t5x_to_pytorch(
        t5x_variables,
        num_layers=config.num_hidden_layers,
        moe_interval=config.moe_layer_interval,
    )
    model.load_state_dict(make_state_dict(torch_params), strict=True)
def convert_t5x_checkpoint_to_pytorch(t5x_checkpoint_path, config_file, pytorch_dump_path):
    """Build the model from a config, fill it from a T5X checkpoint and save it.

    Args:
        t5x_checkpoint_path: Path to the T5X checkpoint to convert.
        config_file: JSON file read with ``LlamaConfig.from_json_file``.
        pytorch_dump_path: Directory passed to ``save_pretrained``.
    """
    # Instantiate an untrained PyTorch model from the JSON configuration.
    config = LlamaConfig.from_json_file(config_file)
    print(f"Building PyTorch model from configuration: {config}")
    model = OpenMoeForCausalLM(config)

    # Overwrite its parameters with the converted checkpoint weights.
    load_t5x_weights_in_t5(model, config, t5x_checkpoint_path)

    # Persist the converted model.
    print(f"Save PyTorch model to {pytorch_dump_path}")
    model.save_pretrained(pytorch_dump_path)

    # Reload once as a sanity check that the saved checkpoint is readable.
    model.from_pretrained(pytorch_dump_path)
    print("Done")
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Converts a native T5X checkpoint into a PyTorch checkpoint.")

    # All three options are required string arguments; only flag and help differ.
    required_options = (
        ("--t5x_checkpoint_path", "Path to the T5X checkpoint."),
        (
            "--config_file",
            "The config json file corresponding to the pre-trained T5 model.\nThis specifies the model architecture.",
        ),
        ("--pytorch_dump_path", "Path to the output PyTorch model."),
    )
    for flag, help_text in required_options:
        parser.add_argument(flag, default=None, type=str, required=True, help=help_text)

    args = parser.parse_args()
    convert_t5x_checkpoint_to_pytorch(args.t5x_checkpoint_path, args.config_file, args.pytorch_dump_path)

View File

@@ -0,0 +1 @@
# Run the OpenMoE checkpoint converter; replace the /path/to/... placeholders with real paths first.
python convert_openmoe_ckpt.py --t5x_checkpoint_path /path/to/t5x --config_file /path/to/config --pytorch_dump_path /path/to/save

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,24 @@
{
"architectures": [
"OpenMoeForCausalLM"
],
"intermediate_size": 8192,
"hidden_size": 2048,
"num_hidden_layers": 24,
"head_dim": 128,
"num_attention_heads": 24,
"dropout_rate": 0.0,
"layer_norm_epsilon": 1e-06,
"vocab_size": 256384,
"hidden_act": "swiglu",
"num_experts": 32,
"topk": 2,
"capacity_factor_train": 1.25,
"capacity_factor_eval": 2.0,
"min_capacity": 4,
"noisy_policy": null,
"drop_tks": true,
"expert_parallel": null,
"gated": true,
"moe_layer_interval": 6
}

View File

@@ -0,0 +1,24 @@
{
"architectures": [
"OpenMoeForCausalLM"
],
"intermediate_size": 2048,
"hidden_size": 768,
"num_hidden_layers": 12,
"head_dim": 64,
"num_attention_heads": 12,
"dropout_rate": 0.0,
"layer_norm_epsilon": 1e-06,
"vocab_size": 256384,
"hidden_act": "swiglu",
"num_experts": 16,
"topk": 2,
"capacity_factor_train": 1.25,
"capacity_factor_eval": 2.0,
"min_capacity": 4,
"noisy_policy": null,
"drop_tks": true,
"expert_parallel": null,
"gated": true,
"moe_layer_interval": 4
}

View File

@@ -0,0 +1,562 @@
import warnings
from functools import partial
from typing import Callable, Dict, List, Optional, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch.nn import Module
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.utils import logging
from colossalai.moe.manager import MOE_MANAGER
from colossalai.pipeline.stage_manager import PipelineStageManager
from colossalai.shardformer.layer import FusedRMSNorm, Linear1D_Col
from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
from .modeling_openmoe import OpenMoeDecoderLayer, OpenMoeForCausalLM, OpenMoeModel
__all__ = ["OpenMoePolicy", "OpenMoeForCausalLMPolicy"]
class OpenMoePolicy(Policy):
    """Base shardformer policy for OpenMoE models.

    Provides fused-normalization replacement, pipeline-stage layer assignment
    and the hook used by subclasses to install pipeline-aware forwards.
    Sequence parallelism, tensor parallelism and flash-attention replacement
    are rejected with ``NotImplementedError``.
    """

    def config_sanity_check(self):
        """No configuration checks are performed."""
        pass

    def preprocess(self):
        """Pad the vocabulary to a multiple of the tensor-parallel size.

        Only acts when tensor parallelism is enabled. Returns the model.
        """
        if self.shard_config.enable_tensor_parallelism:
            vocab_size = self.model.config.vocab_size
            world_size = self.shard_config.tensor_parallel_size
            remainder = vocab_size % world_size
            if remainder != 0:
                # Round the embedding size up to the next multiple of world_size.
                self.model.resize_token_embeddings(vocab_size + world_size - remainder)
        return self.model

    def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
        """Build the module replacement policy.

        Returns:
            A policy dict; it only contains entries when fused normalization
            is enabled.

        Raises:
            NotImplementedError: If sequence parallelism, tensor parallelism
                or flash attention is enabled in the shard config.
        """
        shard_config = self.shard_config
        policy = {}

        if shard_config.enable_sequence_parallelism:
            # The flag is cleared before the error is raised.
            shard_config.enable_sequence_parallelism = False
            raise NotImplementedError(
                "openmoe dosen't support sequence parallelism now, will ignore the sequence parallelism flag.")

        if shard_config.enable_tensor_parallelism:
            raise NotImplementedError("Tensor parallelism is not supported for openmoe model now.")

        if shard_config.enable_fused_normalization:
            # Norms inside every decoder layer.
            decoder_layer_norms = [
                SubModuleReplacementDescription(suffix="input_layernorm", target_module=FusedRMSNorm),
                SubModuleReplacementDescription(suffix="post_attention_layernorm", target_module=FusedRMSNorm),
                # Tolerated when absent - presumably only some layers carry it.
                SubModuleReplacementDescription(
                    suffix="pre_extra_mlp_layernorm",
                    target_module=FusedRMSNorm,
                    ignore_if_not_exist=True,
                ),
            ]
            self.append_or_create_submodule_replacement(
                description=decoder_layer_norms,
                policy=policy,
                target_key=OpenMoeDecoderLayer,
            )

            # Final norm of the base model.
            final_norm = SubModuleReplacementDescription(suffix="norm", target_module=FusedRMSNorm)
            self.append_or_create_submodule_replacement(
                description=final_norm,
                policy=policy,
                target_key=OpenMoeModel,
            )

        if shard_config.enable_flash_attention:
            raise NotImplementedError("Flash attention has already been replaced in openmoe.")

        return policy

    def postprocess(self):
        """Return the model unchanged."""
        return self.model

    def set_pipeline_forward(self, model_cls: nn.Module, new_forward: Callable, policy: Dict) -> None:
        """Register ``new_forward`` as the forward of ``model_cls`` when pipelining.

        Does nothing without a pipeline stage manager. Otherwise ``new_forward``
        is bound to the stage manager and this stage's layer index range and
        added to ``policy`` as a method replacement.
        """
        stage_manager = self.pipeline_stage_manager
        if not stage_manager:
            return

        # The decoder stack lives on the model itself or on its ``.model`` attribute.
        is_base_model = self.model.__class__.__name__ == "OpenMoeModel"
        module = self.model if is_base_model else self.model.model

        layers_per_stage = self.distribute_layers(len(module.layers), stage_manager.num_stages)
        stage_index = Policy.get_stage_index(layers_per_stage, stage_manager.stage)
        bound_forward = partial(new_forward, stage_manager=stage_manager, stage_index=stage_index)
        self.append_or_create_method_replacement(
            description={"forward": bound_forward},
            policy=policy,
            target_key=model_cls,
        )
        return

    def get_held_layers(self) -> List[Module]:
        """Return the modules held by the current pipeline stage.

        The first stage additionally holds the token embedding and the last
        stage the final norm.
        """
        assert self.pipeline_stage_manager is not None

        is_base_model = self.model.__class__.__name__ == "OpenMoeModel"
        module = self.model if is_base_model else self.model.model
        stage_manager = self.pipeline_stage_manager

        held_layers = []
        layers_per_stage = self.distribute_layers(len(module.layers), stage_manager.num_stages)
        if stage_manager.is_first_stage():
            held_layers.append(module.embed_tokens)

        first, last = self.get_stage_index(layers_per_stage, stage_manager.stage)
        held_layers.extend(module.layers[first:last])

        if stage_manager.is_last_stage():
            held_layers.append(module.norm)
        return held_layers

    @staticmethod
    def distribute_layers(num_layers: int, num_stages: int) -> List[int]:
        """Return how many layers each pipeline stage receives.

        A few (num_layers, num_stages) combinations use hand-picked, uneven
        splits; everything else falls back to ``Policy.distribute_layers``
        after printing a notice.
        """
        hand_tuned = {
            (24, 4): [7, 7, 7, 3],
            (24, 2): [15, 9],
            (12, 4): [5, 5, 5, 1],
            (12, 2): [8, 4],
        }
        split = hand_tuned.get((num_layers, num_stages))
        if split is not None:
            return split

        print(f"num_layers: {num_layers}, num_stages: {num_stages} not optimized, use origin pp policy")
        return Policy.distribute_layers(num_layers, num_stages)
class OpenMoeModelPolicy(OpenMoePolicy):
    """Policy for the bare ``OpenMoeModel`` (no language-model head)."""

    def __init__(self) -> None:
        super().__init__()

    def module_policy(self):
        """Extend the base policy with the pipeline forward of ``OpenMoeModel``."""
        policy = super().module_policy()
        if not self.pipeline_stage_manager:
            return policy

        # Only replace the forward when a pipeline stage manager is present.
        self.set_pipeline_forward(
            model_cls=OpenMoeModel,
            new_forward=OpenMoePipelineForwards.openmoe_model_forward,
            policy=policy,
        )
        return policy

    def get_held_layers(self) -> List[Module]:
        """Return the modules held by the current pipeline stage."""
        return super().get_held_layers()

    def get_shared_params(self) -> List[Dict[int, Tensor]]:
        """Return an empty list: this policy reports no shared parameters."""
        return []
class OpenMoeForCausalLMPolicy(OpenMoePolicy):
    """Policy for ``OpenMoeForCausalLM`` (decoder stack plus ``lm_head``)."""

    def module_policy(self):
        """Extend the base policy for the causal-LM wrapper.

        Adds an ``lm_head`` replacement when tensor parallelism is enabled and
        installs the pipeline forward when a pipeline stage manager is present.
        """
        policy = super().module_policy()

        if self.shard_config.enable_tensor_parallelism:
            # Column-parallel LM head whose output is gathered across ranks.
            lm_head_replacement = SubModuleReplacementDescription(
                suffix="lm_head",
                target_module=Linear1D_Col,
                kwargs=dict(gather_output=True),
            )
            policy.update(
                {OpenMoeForCausalLM: ModulePolicyDescription(sub_module_replacement=[lm_head_replacement])})

        if self.pipeline_stage_manager:
            self.set_pipeline_forward(
                model_cls=OpenMoeForCausalLM,
                new_forward=OpenMoePipelineForwards.llama_for_causal_lm_forward,
                policy=policy,
            )

        return policy

    def get_held_layers(self) -> List[Module]:
        """Return the modules held by this stage; the last stage also holds ``lm_head``."""
        stage_manager = self.pipeline_stage_manager
        held_layers = super().get_held_layers()
        if stage_manager.is_last_stage():
            held_layers.append(self.model.lm_head)
        return held_layers

    def get_shared_params(self) -> List[Dict[int, Tensor]]:
        """Report the tied embedding / LM-head weight across pipeline stages.

        Returns:
            ``[{0: embedding weight, last_stage: lm_head weight}]`` when more
            than one pipeline stage exists and both weights are the same
            object; otherwise an empty list.
        """
        llama_model = self.model.model
        stage_manager = self.pipeline_stage_manager
        if not stage_manager or stage_manager.num_stages <= 1:
            return []

        # Weights count as tied only when they are the very same object.
        if id(llama_model.embed_tokens.weight) != id(self.model.lm_head.weight):
            return []

        last_stage = stage_manager.num_stages - 1
        return [{
            0: llama_model.embed_tokens.weight,
            last_stage: self.model.lm_head.weight,
        }]
class OpenMoePipelineForwards:
    """
    Micro library of replacement ``forward`` functions for OpenMoE models
    under the pipeline-parallel setting.

    Every function is a ``staticmethod`` that receives the model as its explicit
    ``self`` argument, plus the pipeline ``stage_manager`` and the ``stage_index``
    (start, end) range of decoder layers owned by the current stage.
    """

    @staticmethod
    def openmoe_model_forward(
        self: OpenMoeModel,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        stage_manager: Optional[PipelineStageManager] = None,
        hidden_states: Optional[torch.FloatTensor] = None,
        stage_index: Optional[List[int]] = None,
        past_router_aux_loss: Optional[torch.FloatTensor] = None,
        past_router_z_loss: Optional[torch.FloatTensor] = None,
    ):
        """Run the decoder layers owned by the current pipeline stage.

        The first stage embeds ``input_ids`` (or uses ``inputs_embeds``); every
        other stage continues from ``hidden_states``. Layers
        ``stage_index[0]:stage_index[1]`` are executed and the last stage also
        applies the final norm.

        ``output_attentions``, ``output_hidden_states`` and ``use_cache`` are
        not supported here: when truthy they are reset to ``False`` with a
        warning. ``return_dict`` is resolved but not used by this function.

        Args:
            stage_manager: Pipeline stage manager; must not be ``None``.
            hidden_states: Output of the previous stage (non-first stages).
            stage_index: ``(start, end)`` range of layers for this stage.
            past_router_aux_loss: Router aux loss accumulated by earlier stages.
            past_router_z_loss: Router z loss accumulated by earlier stages.

        Returns:
            On the last stage, the tuple ``(hidden_states, next_cache,
            all_hidden_states, all_self_attns, router_aux_loss, router_z_loss)``
            where the three middle entries are ``None``. On any other stage, a
            dict with keys ``"hidden_states"``, ``"router_aux_loss"`` and
            ``"router_z_loss"``.

        Raises:
            ValueError: On the first stage, if both or neither of ``input_ids``
                and ``inputs_embeds`` are given.
        """
        # reset moe loss for different data
        MOE_MANAGER.reset_loss()

        logger = logging.get_logger(__name__)

        output_attentions = (output_attentions if output_attentions is not None else self.config.output_attentions)
        output_hidden_states = (output_hidden_states
                                if output_hidden_states is not None else self.config.output_hidden_states)
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = (return_dict if return_dict is not None else self.config.use_return_dict)

        # retrieve input_ids and inputs_embeds
        if stage_manager.is_first_stage():
            if input_ids is not None and inputs_embeds is not None:
                raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
            elif input_ids is not None:
                batch_size, seq_length = input_ids.shape
            elif inputs_embeds is not None:
                batch_size, seq_length, _ = inputs_embeds.shape
            else:
                raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            if inputs_embeds is None:
                inputs_embeds = self.embed_tokens(input_ids)
            hidden_states = inputs_embeds
        else:
            # later stages derive the shapes from the activations they receive
            input_shape = hidden_states.shape[:-1]
            batch_size, seq_length = input_shape
            device = hidden_states.device

        seq_length_with_past = seq_length
        past_key_values_length = 0

        # TODO(jianghai): left the recording kv-value tensors as () or None type, this feature may be added in the future.
        if output_attentions:
            logger.warning_once("output_attentions=True is not supported for pipeline models at the moment.")
            output_attentions = False
        if output_hidden_states:
            logger.warning_once("output_hidden_states=True is not supported for pipeline models at the moment.")
            output_hidden_states = False
        if use_cache:
            logger.warning_once("use_cache=True is not supported for pipeline models at the moment.")
            use_cache = False

        if past_key_values is not None:
            past_key_values_length = past_key_values[0][0].shape[2]
            seq_length_with_past = seq_length_with_past + past_key_values_length

        if position_ids is None:
            position_ids = torch.arange(
                past_key_values_length,
                seq_length + past_key_values_length,
                dtype=torch.long,
                device=device,
            )
            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
        else:
            position_ids = position_ids.view(-1, seq_length).long()

        # embed positions, for the first stage, hidden_states is the input embeddings,
        # for the other stages, hidden_states is the output of the previous stage
        if attention_mask is None:
            attention_mask = torch.ones(
                (batch_size, seq_length_with_past),
                dtype=torch.bool,
                device=hidden_states.device,
            )
        attention_mask = self._prepare_decoder_attention_mask(
            attention_mask,
            (batch_size, seq_length),
            hidden_states,
            past_key_values_length,
        )

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...")
                use_cache = False

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = () if use_cache else None

        start_idx, end_idx = stage_index[0], stage_index[1]
        # idx is the global layer index so that past_key_values is indexed correctly
        for idx, decoder_layer in enumerate(self.layers[start_idx:end_idx], start=start_idx):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            past_key_value = (past_key_values[idx] if past_key_values is not None else None)

            if self.gradient_checkpointing and self.training:

                def create_custom_forward(module):

                    def custom_forward(*inputs):
                        # None for past_key_value
                        return module(*inputs, output_attentions, None)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(decoder_layer),
                    hidden_states,
                    attention_mask,
                    position_ids,
                    None,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        if stage_manager.is_last_stage():
            hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)
        next_cache = next_decoder_cache if use_cache else None

        # concat past losses with current ones
        # NOTE(review): `+` presumably concatenates per-layer loss lists - confirm against MOE_MANAGER.get_loss
        router_aux_loss, router_z_loss = MOE_MANAGER.get_loss()
        if past_router_aux_loss is not None and past_router_z_loss is not None:
            router_aux_loss = past_router_aux_loss + router_aux_loss
            router_z_loss = past_router_z_loss + router_z_loss

        if stage_manager.is_last_stage():
            return tuple([
                hidden_states,
                next_cache,
                all_hidden_states,
                all_self_attns,
                router_aux_loss,
                router_z_loss,
            ])
        # always return dict for intermediate stage
        return {
            "hidden_states": hidden_states,
            "router_aux_loss": router_aux_loss,
            "router_z_loss": router_z_loss,
        }

    @staticmethod
    def llama_for_causal_lm_forward(
        self: OpenMoeForCausalLM,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        stage_manager: Optional[PipelineStageManager] = None,
        hidden_states: Optional[torch.FloatTensor] = None,
        stage_index: Optional[List[int]] = None,
        chunk_head: Optional[bool] = True,
        past_router_aux_loss: Optional[torch.FloatTensor] = None,
        past_router_z_loss: Optional[torch.FloatTensor] = None,
    ):
        """Pipeline-aware forward of ``OpenMoeForCausalLM``.

        Runs ``openmoe_model_forward`` on ``self.model`` for the current stage.
        Intermediate stages only forward activations and router losses; the
        last stage applies ``lm_head`` and, when ``labels`` is given, computes
        the language-modeling loss plus the router losses.

        Args:
            labels: Target token ids. They are shifted by one position before
                the loss is computed (position ``t`` predicts token ``t + 1``).
                When ``None`` only logits are produced.
            chunk_head: When truthy and ``labels`` is given, the head and loss
                are evaluated one batch element at a time under activation
                checkpointing to reduce memory; ``logits`` is then ``None``.
            stage_manager: Pipeline stage manager; must not be ``None``.
            hidden_states: Output of the previous stage (non-first stages).
            stage_index: ``(start, end)`` range of layers for this stage.
            past_router_aux_loss: Router aux loss accumulated by earlier stages.
            past_router_z_loss: Router z loss accumulated by earlier stages.

        Returns:
            On the last stage, a ``CausalLMOutputWithPast`` (or, when
            ``return_dict`` is falsy, a tuple starting with ``loss`` if it is
            not ``None``, followed by ``logits`` and the remaining base-model
            outputs). On any other stage, a dict with keys ``"hidden_states"``,
            ``"past_router_aux_loss"`` and ``"past_router_z_loss"`` that is fed
            to the next stage.
        """
        logger = logging.get_logger(__name__)
        output_attentions = (output_attentions if output_attentions is not None else self.config.output_attentions)
        output_hidden_states = (output_hidden_states
                                if output_hidden_states is not None else self.config.output_hidden_states)
        return_dict = (return_dict if return_dict is not None else self.config.use_return_dict)

        # TODO(jianghai): left the recording kv-value tensors as () or None type, this feature may be added in the future.
        if output_attentions:
            logger.warning_once("output_attentions=True is not supported for pipeline models at the moment.")
            output_attentions = False
        if output_hidden_states:
            logger.warning_once("output_hidden_states=True is not supported for pipeline models at the moment.")
            output_hidden_states = False

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = OpenMoePipelineForwards.openmoe_model_forward(
            self.model,
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            stage_manager=stage_manager,
            hidden_states=hidden_states,
            stage_index=stage_index,
            past_router_aux_loss=past_router_aux_loss,
            past_router_z_loss=past_router_z_loss,
        )

        if stage_manager.is_last_stage():
            (
                hidden_states,
                past_key_values,
                all_hidden_states,
                attentions,
                router_aux_loss,
                router_z_loss,
            ) = outputs

            # NOTE: logits always come from self.lm_head below. A separate
            # pretraining_tp-sliced projection would be overwritten on every
            # path, so it is not computed (it would only waste a full
            # vocabulary-sized activation).
            loss = None
            # if no training, just do forward
            if labels is None:
                logits = self.lm_head(hidden_states)
                logits = logits.float()
            # the vocab size for openmoe is 30w+
            # which causes great activation memory in training, up to 20G for one sequence
            # so we use chunk and checkpoint to reduce memory
            else:
                if chunk_head:

                    def create_custom_forward(module):

                        def custom_forward(*inputs):
                            # inputs[0]: hidden states of one sample, inputs[1]: its labels
                            logits = module(inputs[0])
                            logits = logits.float()
                            # Shift so that tokens < n predict n
                            shift_logits = logits[..., :-1, :].contiguous().float()
                            shift_labels = inputs[1][..., 1:].contiguous()
                            # Flatten the tokens
                            loss = self._calculate_loss(shift_logits, shift_labels)
                            return loss

                        return custom_forward

                    aux_loss, z_loss = self._calculate_router_loss(router_aux_loss, router_z_loss)
                    loss = aux_loss + z_loss
                    # one checkpointed head/loss evaluation per batch element
                    for batch_idx in range(hidden_states.shape[0]):
                        loss = loss + torch.utils.checkpoint.checkpoint(
                            create_custom_forward(self.lm_head),
                            hidden_states[batch_idx:batch_idx + 1, :],
                            labels[batch_idx:batch_idx + 1, :],
                        )
                    # logits are never materialized for the whole batch in this mode
                    logits = None
                else:
                    logits = self.lm_head(hidden_states)
                    logits = logits.float()
                    # Shift so that tokens < n predict n
                    shift_logits = logits[..., :-1, :].contiguous()
                    shift_labels = labels[..., 1:].contiguous()
                    # Flatten the tokens
                    aux_loss, z_loss = self._calculate_router_loss(router_aux_loss, router_z_loss)
                    loss = aux_loss + z_loss
                    loss = loss + self._calculate_loss(shift_logits, shift_labels)

            if not return_dict:
                output = (logits,) + outputs[1:]
                return (loss,) + output if loss is not None else output

            return CausalLMOutputWithPast(
                loss=loss,
                logits=logits,
                past_key_values=past_key_values,
                hidden_states=all_hidden_states,
                attentions=attentions,
            )
        else:
            # hand activations and accumulated router losses to the next stage
            hidden_states = outputs["hidden_states"]
            router_aux_loss = outputs["router_aux_loss"]
            router_z_loss = outputs["router_z_loss"]
            return {
                "hidden_states": hidden_states,
                "past_router_aux_loss": router_aux_loss,
                "past_router_z_loss": router_z_loss,
            }