ColossalAI/examples/tutorial/sequence_parallel/model/layers/head.py

import torch
import torch.nn as nn
import torch.nn.functional as F
from loss_func.cross_entropy import vocab_cross_entropy

import colossalai
from colossalai.kernel import LayerNorm
from colossalai.legacy.context import ParallelMode
from colossalai.legacy.core import global_context as gpc

from .embedding import VocabEmbedding
from .linear import Linear
from .pooler import Pooler

class BertLMHead(nn.Module):
    """Masked LM head for BERT.

    Arguments:
        vocab_size: size of the vocabulary
        hidden_size: hidden size
    """

    def __init__(
        self,
        vocab_size,
        hidden_size,
    ):
        super(BertLMHead, self).__init__()

        self.bias = torch.nn.Parameter(torch.zeros(vocab_size))
        self.dense = Linear(hidden_size, hidden_size)
        self.layernorm = LayerNorm(hidden_size)
        self.gelu = torch.nn.functional.gelu

    def forward(self, hidden_states, word_embeddings_weight, lm_labels):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.gelu(hidden_states)
        hidden_states = self.layernorm(hidden_states)
        # Project back to vocabulary logits with the (tied) word embedding weights.
        output = F.linear(hidden_states, word_embeddings_weight, self.bias)
        lm_loss = vocab_cross_entropy(output, lm_labels)
        return lm_loss
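
# A minimal, self-contained sketch (plain PyTorch, no sequence parallelism) of the
# weight-tying pattern used in BertLMHead above: hidden states are projected back to
# vocabulary logits with the word embedding matrix itself. The helper name
# `_tied_lm_head_demo` and the sizes below are illustrative assumptions, not part of
# the original example.
def _tied_lm_head_demo():
    vocab_size, hidden_size, seq_len = 32, 8, 4
    embedding = nn.Embedding(vocab_size, hidden_size)
    bias = torch.zeros(vocab_size)
    hidden_states = torch.randn(seq_len, hidden_size)
    # Same projection as BertLMHead.forward: the logits share weights with the embedding.
    logits = F.linear(hidden_states, embedding.weight, bias)
    assert logits.shape == (seq_len, vocab_size)
    return logits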

class BertBinaryHead(nn.Module):
    """Binary (next sentence prediction) head for BERT."""

    def __init__(self, hidden_size):
        super().__init__()
        self.pooler = Pooler(hidden_size)
        self.dense = Linear(hidden_size, 2)

    def forward(self, hidden_states):
        # With sequence parallelism, the sequence is split across ranks, so the
        # leading [CLS] token lives on the first sequence-parallel rank; only that
        # rank pools it and computes the binary output.
        if gpc.get_local_rank(ParallelMode.SEQUENCE) == 0:
            output = self.pooler(hidden_states)
            output = self.dense(output)
        else:
            output = None
        return output

class BertDualHead(nn.Module):
    """Combines the masked LM head with an optional binary head."""

    def __init__(self, hidden_size, vocab_size, add_binary_head):
        super().__init__()
        self.lm_head = BertLMHead(vocab_size, hidden_size)
        self.add_binary_head = add_binary_head
        if add_binary_head:
            self.binary_head = BertBinaryHead(hidden_size)
        else:
            self.binary_head = None

    def forward(self, hidden_states, word_embeddings_weight, lm_labels):
        if self.add_binary_head:
            binary_output = self.binary_head(hidden_states)
        else:
            binary_output = None
        lm_loss = self.lm_head(hidden_states, word_embeddings_weight, lm_labels)
        return lm_loss, binary_output
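
# Hedged usage sketch: one common way the two outputs of BertDualHead are combined
# during pretraining. The helper name `_pretraining_loss_demo` and `binary_labels`
# are assumptions for illustration; the actual training loop of this example lives
# elsewhere in the repository.
def _pretraining_loss_demo(lm_loss, binary_output, binary_labels):
    # The masked-LM loss is always present; the binary (NSP-style) loss is added
    # only where the binary head actually produced an output.
    if binary_output is None:
        return lm_loss
    binary_loss = F.cross_entropy(binary_output, binary_labels)
    return lm_loss + binary_loss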