[shardformer] support whisper (#4212)

* support whisper * fix bug in vocabembedding * support downstream model of whisper * update readme
2025-09-08 20:40:34 +00:00 · 2023-07-17 14:25:32 +08:00
parent dd2bf02679
commit 9ee4ebea83
7 changed files with 443 additions and 2 deletions
--- a/colossalai/shardformer/README.md
+++ b/colossalai/shardformer/README.md
@@ -102,7 +102,7 @@ We will follow this roadmap to develop Shardformer:
      - [ ] SwinTransformer
      - [ ] SwinTransformer V2
    - [ ] Audio
-      - [ ] Whisper
+      - [x] Whisper
    - [ ] Multi-modal
      - [ ] To be added

--- a/colossalai/shardformer/layer/embedding.py
+++ b/colossalai/shardformer/layer/embedding.py
@@ -202,7 +202,6 @@ class VocabParallelEmbedding1D(ParallelModule):
        super().__init__()
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
-        self.padding_idx = padding_idx
        self.embed_args = args
        self.embed_kwargs = kwargs
        self.process_group = process_group
@@ -276,6 +275,15 @@ class VocabParallelEmbedding1D(ParallelModule):
            with torch.no_grad():
                self.weight[self.padding_idx - self.vocab_start_index].fill_(0)

+    def _select_padding_idx(self, padding_idx: int):
+        # select padding index according to the rank
+        if padding_idx is None:
+            return None
+        elif padding_idx < self.vocab_end_index and padding_idx >= self.vocab_start_index:
+            return padding_idx - self.vocab_start_index
+        else:
+            return None
+
    def forward(self, input_: Tensor) -> Tensor:
        # Build the mask.
        input_mask = (input_ < self.vocab_start_index) | (input_ >= self.vocab_end_index)
--- a/colossalai/shardformer/policies/auto_policy.py
+++ b/colossalai/shardformer/policies/auto_policy.py
@@ -105,6 +105,14 @@ _POLICY_LIST = {
    "transformers.models.bloom.modeling_bloom.BloomForQuestionAnswering":
        PolicyLocation(file_name="bloom", class_name="BloomForQuestionAnsweringPolicy"),

+    # Whisper
+    "transformers.models.whisper.modeling_whisper.WhisperModel":
+        PolicyLocation(file_name="whisper", class_name="WhisperModelPolicy"),
+    "transformers.models.whisper.modeling_whisper.WhisperForConditionalGeneration":
+        PolicyLocation(file_name="whisper", class_name="WhisperForConditionalGenerationPolicy"),
+    "transformers.models.whisper.modeling_whisper.WhisperForAudioClassification":
+        PolicyLocation(file_name="whisper", class_name="WhisperForAudioClassificationPolicy"),
+
    # Sam
    "transformers.models.sam.modeling_sam.SamModel":
        PolicyLocation(file_name="sam", class_name="SamModelPolicy"),
--- a/colossalai/shardformer/policies/whisper.py
+++ b/colossalai/shardformer/policies/whisper.py
@@ -0,0 +1,232 @@
+import torch.nn as nn
+
+import colossalai.shardformer.layer as col_nn
+
+from .._utils import getattr_, setattr_
+from .basepolicy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
+
+__all__ = [
+    'WhisperPolicy', 'WhisperModelPolicy', 'WhisperForConditionalGenerationPolicy', 'WhisperForAudioClassification'
+]
+
+
+class WhisperPolicy(Policy):
+
+    def config_sanity_check(self):
+        pass
+
+    def preprocess(self):
+        # reshape the embedding layer
+        r"""
+        Reshape the Embedding layer to make the embedding dimension divisible by world_size
+        """
+        # TODO:
+        vocab_size = self.model.config.vocab_size
+        world_size = self.shard_config.tensor_parallel_size
+        if vocab_size % world_size != 0:
+            new_vocab_size = vocab_size + world_size - vocab_size % world_size
+            self.model.resize_token_embeddings(new_vocab_size)
+        return self.model
+
+    def module_policy(self):
+        from transformers.models.whisper.modeling_whisper import (
+            WhisperDecoder,
+            WhisperDecoderLayer,
+            WhisperEncoder,
+            WhisperEncoderLayer,
+        )
+
+        policy = {}
+
+        if self.shard_config.enable_tensor_parallelism:
+            policy[WhisperEncoderLayer] = ModulePolicyDescription(attribute_replacement={
+                "self_attn.embed_dim":
+                    self.model.config.d_model // self.shard_config.tensor_parallel_size,
+                "self_attn.num_heads":
+                    self.model.config.encoder_attention_heads // self.shard_config.tensor_parallel_size,
+            },
+                                                                  sub_module_replacement=[
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="self_attn.q_proj",
+                                                                          target_module=col_nn.Linear1D_Col,
+                                                                      ),
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="self_attn.k_proj",
+                                                                          target_module=col_nn.Linear1D_Col,
+                                                                      ),
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="self_attn.v_proj",
+                                                                          target_module=col_nn.Linear1D_Col,
+                                                                      ),
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="self_attn.out_proj",
+                                                                          target_module=col_nn.Linear1D_Row,
+                                                                      ),
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="fc1",
+                                                                          target_module=col_nn.Linear1D_Col,
+                                                                      ),
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="fc2",
+                                                                          target_module=col_nn.Linear1D_Row,
+                                                                      ),
+                                                                  ])
+
+            policy[WhisperDecoderLayer] = ModulePolicyDescription(attribute_replacement={
+                "self_attn.embed_dim":
+                    self.model.config.d_model // self.shard_config.tensor_parallel_size,
+                "self_attn.num_heads":
+                    self.model.config.decoder_attention_heads // self.shard_config.tensor_parallel_size,
+                "encoder_attn.embed_dim":
+                    self.model.config.d_model // self.shard_config.tensor_parallel_size,
+                "encoder_attn.num_heads":
+                    self.model.config.encoder_attention_heads // self.shard_config.tensor_parallel_size,
+            },
+                                                                  sub_module_replacement=[
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="self_attn.q_proj",
+                                                                          target_module=col_nn.Linear1D_Col,
+                                                                      ),
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="self_attn.k_proj",
+                                                                          target_module=col_nn.Linear1D_Col,
+                                                                      ),
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="self_attn.v_proj",
+                                                                          target_module=col_nn.Linear1D_Col,
+                                                                      ),
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="self_attn.out_proj",
+                                                                          target_module=col_nn.Linear1D_Row,
+                                                                      ),
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="encoder_attn.q_proj",
+                                                                          target_module=col_nn.Linear1D_Col,
+                                                                      ),
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="encoder_attn.k_proj",
+                                                                          target_module=col_nn.Linear1D_Col,
+                                                                      ),
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="encoder_attn.v_proj",
+                                                                          target_module=col_nn.Linear1D_Col,
+                                                                      ),
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="encoder_attn.out_proj",
+                                                                          target_module=col_nn.Linear1D_Row,
+                                                                      ),
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="fc1",
+                                                                          target_module=col_nn.Linear1D_Col,
+                                                                      ),
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="fc2",
+                                                                          target_module=col_nn.Linear1D_Row,
+                                                                      ),
+                                                                  ])
+
+            policy[WhisperDecoder] = ModulePolicyDescription(sub_module_replacement=[
+                SubModuleReplacementDescription(
+                    suffix="embed_tokens",
+                    target_module=col_nn.VocabParallelEmbedding1D,
+                ),
+            ])
+
+        # optimization configuration
+        if self.shard_config.enable_fused_normalization:
+            # Handle encoder layer
+            self.append_or_create_submodule_replacement(description=[
+                SubModuleReplacementDescription(
+                    suffix="self_attn_layer_norm",
+                    target_module=col_nn.FusedLayerNorm,
+                ),
+                SubModuleReplacementDescription(
+                    suffix="final_layer_norm",
+                    target_module=col_nn.FusedLayerNorm,
+                )
+            ],
+                                                        policy=policy,
+                                                        target_key=WhisperEncoderLayer)
+
+            # Handle decoder layer
+            self.append_or_create_submodule_replacement(description=[
+                SubModuleReplacementDescription(
+                    suffix="self_attn_layer_norm",
+                    target_module=col_nn.FusedLayerNorm,
+                ),
+                SubModuleReplacementDescription(
+                    suffix="final_layer_norm",
+                    target_module=col_nn.FusedLayerNorm,
+                )
+            ],
+                                                        policy=policy,
+                                                        target_key=WhisperDecoderLayer)
+
+            # handle encoder layer
+            self.append_or_create_submodule_replacement(description=[
+                SubModuleReplacementDescription(
+                    suffix="layer_norm",
+                    target_module=col_nn.FusedLayerNorm,
+                )
+            ],
+                                                        policy=policy,
+                                                        target_key=WhisperEncoder)
+
+            # handle decoder layer
+            self.append_or_create_submodule_replacement(description=[
+                SubModuleReplacementDescription(
+                    suffix="layer_norm",
+                    target_module=col_nn.FusedLayerNorm,
+                )
+            ],
+                                                        policy=policy,
+                                                        target_key=WhisperDecoder)
+        return policy
+
+    def add_lm_head_policy(self, base_policy):
+        from transformers.models.whisper.modeling_whisper import WhisperForConditionalGeneration
+
+        # optimize for tensor parallelism
+        if self.shard_config.enable_tensor_parallelism:
+            self.append_or_create_submodule_replacement(description=SubModuleReplacementDescription(
+                suffix="proj_out", target_module=col_nn.Linear1D_Col, kwargs={"gather_output": True}),
+                                                        policy=base_policy,
+                                                        target_key=WhisperForConditionalGeneration)
+
+        return base_policy
+
+    def postprocess(self):
+        return self.model
+
+
+# WhisperModel
+class WhisperModelPolicy(WhisperPolicy):
+
+    def __init__(self) -> None:
+        super().__init__()
+
+
+# WhisperForConditionalGeneration
+class WhisperForConditionalGenerationPolicy(WhisperPolicy):
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def module_policy(self):
+        module_policy = super().module_policy()
+        module_policy = self.add_lm_head_policy(module_policy)
+        return module_policy
+
+    def postprocess(self):
+        binding_map = {"model.decoder.embed_tokens.weight": "proj_out.weight"}
+        for k, v in binding_map.items():
+            param = getattr_(self.model, k)
+            setattr_(self.model, v, param)
+        return self.model
+
+
+# WhisperForAudioClassification
+class WhisperForAudioClassificationPolicy(WhisperPolicy):
+
+    def __init__(self) -> None:
+        super().__init__()