[misc] update pre-commit and run all files (#4752)

* [misc] update pre-commit

* [misc] run pre-commit

* [misc] remove useless configuration files

* [misc] ignore cuda for clang-format
Author: Hongxin Liu (committed by GitHub)
Date: 2023-09-19 14:20:26 +08:00
Commit: 079bf3cb26
Parent: 3c6b831c26
1268 changed files with 50037 additions and 38444 deletions

View File

@@ -1,4 +1,4 @@
from .embedding import VocabEmbedding, Embedding
from .bert_layer import BertLayer
from .embedding import Embedding, VocabEmbedding
from .head import BertDualHead
from .preprocess import PreProcessor
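
Not part of the diff: the swap above (VocabEmbedding, Embedding -> Embedding, VocabEmbedding) is what isort does by default, sorting names alphabetically within a from-import. A minimal sketch using isort's Python API, assuming isort is one of the hooks in the updated pre-commit config (the config itself is not shown in this excerpt):

# Illustrative sketch only, not part of this commit. Requires `pip install isort`.
import isort

src = "from .embedding import VocabEmbedding, Embedding\n"
print(isort.code(src), end="")
# -> from .embedding import Embedding, VocabEmbedding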

View File

@@ -20,18 +20,20 @@ class BertLayer(nn.Module):
output of the same size.
"""
def __init__(self,
layer_number,
hidden_size,
num_attention_heads,
attention_dropout,
mlp_ratio,
hidden_dropout,
is_naive_fp16,
apply_residual_connection_post_layernorm=False,
fp32_residual_connection=False,
bias_dropout_fusion: bool = True,
convert_fp16_to_fp32_in_softmax: bool = False):
def __init__(
self,
layer_number,
hidden_size,
num_attention_heads,
attention_dropout,
mlp_ratio,
hidden_dropout,
is_naive_fp16,
apply_residual_connection_post_layernorm=False,
fp32_residual_connection=False,
bias_dropout_fusion: bool = True,
convert_fp16_to_fp32_in_softmax: bool = False,
):
super().__init__()
self.layer_number = layer_number
@@ -50,7 +52,8 @@ class BertLayer(nn.Module):
layer_number=layer_number,
apply_query_key_layer_scaling=True,
convert_fp16_to_fp32_in_softmax=convert_fp16_to_fp32_in_softmax,
fp16=is_naive_fp16)
fp16=is_naive_fp16,
)
self.hidden_dropout = hidden_dropout
self.bias_dropout_fusion = bias_dropout_fusion
@@ -90,8 +93,9 @@ class BertLayer(nn.Module):
# re-enable torch grad to enable fused optimization.
with torch.enable_grad():
layernorm_input = bias_dropout_add_func(attention_output, attention_bias.expand_as(residual), residual,
self.hidden_dropout)
layernorm_input = bias_dropout_add_func(
attention_output, attention_bias.expand_as(residual), residual, self.hidden_dropout
)
# Layer norm post the self attention.
layernorm_output = self.post_attention_layernorm(layernorm_input)
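
Not part of the diff: the exploded signature, double quotes, and re-wrapped call above are characteristic of black. A minimal sketch of applying the same formatter programmatically; the 120-character line length is an assumption, the repository's actual setting is not visible in this excerpt. Signatures that fit the limit are joined onto one line, while ones that do not (like the eleven-argument __init__ above) are split one argument per line with a trailing comma.

# Illustrative sketch only, not part of this commit. Requires `pip install black`.
import black

src = (
    "def __init__(self,\n"
    "             layer_number,\n"
    "             hidden_size,\n"
    "             num_attention_heads):\n"
    "    pass\n"
)
print(black.format_str(src, mode=black.Mode(line_length=120)))
# -> def __init__(self, layer_number, hidden_size, num_attention_heads):
#        pass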

View File

@@ -1,5 +1,6 @@
import torch
def bias_dropout_add(x, bias, residual, prob, training):
# type: (Tensor, Tensor, Tensor, float, bool) -> Tensor
out = torch.nn.functional.dropout(x + bias, p=prob, training=training)
@@ -10,4 +11,5 @@ def bias_dropout_add(x, bias, residual, prob, training):
def get_bias_dropout_add(training):
def _bias_dropout_add(x, bias, residual, prob):
return bias_dropout_add(x, bias, residual, prob, training)
return _bias_dropout_add
return _bias_dropout_add
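
Not part of the diff: a usage sketch of the closure pattern above, with bias_dropout_add and get_bias_dropout_add taken to be in scope as defined in this file:

# Illustrative sketch only, not part of this commit.
import torch

x = torch.randn(4, 8)        # e.g. attention output
bias = torch.randn(8)        # broadcasts over the leading dimension
residual = torch.randn(4, 8)

fused = get_bias_dropout_add(training=True)  # binds `training` into _bias_dropout_add
out = fused(x, bias, residual, 0.1)          # dropout(x + bias, p=0.1) combined with residual
assert out.shape == residual.shape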

View File

@@ -5,7 +5,6 @@ import torch.nn.init as init
class VocabEmbedding(torch.nn.Module):
def __init__(self, num_embeddings, embedding_dim):
super(VocabEmbedding, self).__init__()
# Keep the input dimensions.
@@ -13,26 +12,29 @@ class VocabEmbedding(torch.nn.Module):
self.embedding_dim = embedding_dim
self.padding_idx = None
self.max_norm = None
self.norm_type = 2.
self.norm_type = 2.0
self.scale_grad_by_freq = False
self.sparse = False
self._weight = None
# Allocate weights and initialize.
self.weight = nn.Parameter(torch.empty(
self.num_embeddings, self.embedding_dim))
self.weight = nn.Parameter(torch.empty(self.num_embeddings, self.embedding_dim))
init.xavier_uniform_(self.weight)
def forward(self, hidden_state):
output = F.embedding(hidden_state, self.weight,
self.padding_idx, self.max_norm,
self.norm_type, self.scale_grad_by_freq,
self.sparse)
output = F.embedding(
hidden_state,
self.weight,
self.padding_idx,
self.max_norm,
self.norm_type,
self.scale_grad_by_freq,
self.sparse,
)
return output
def __repr__(self):
return f'VocabEmbedding(num_embeddings={self.num_embeddings}, ' \
f'embedding_dim={self.embedding_dim})'
return f"VocabEmbedding(num_embeddings={self.num_embeddings}, " f"embedding_dim={self.embedding_dim})"
class Embedding(nn.Module):
@@ -48,12 +50,7 @@ class Embedding(nn.Module):
will ignore this embedding
"""
def __init__(self,
hidden_size,
vocab_size,
max_sequence_length,
embedding_dropout_prob,
num_tokentypes):
def __init__(self, hidden_size, vocab_size, max_sequence_length, embedding_dropout_prob, num_tokentypes):
super(Embedding, self).__init__()
self.hidden_size = hidden_size
@@ -62,16 +59,14 @@ class Embedding(nn.Module):
self.word_embeddings = VocabEmbedding(vocab_size, self.hidden_size)
# Position embedding (serial).
self.position_embeddings = torch.nn.Embedding(
max_sequence_length, self.hidden_size)
self.position_embeddings = torch.nn.Embedding(max_sequence_length, self.hidden_size)
# Token type embedding.
# Add this as an optional field that can be added through
# method call so we can load a pretrain model without
# token types and add them as needed.
if self.num_tokentypes > 0:
self.tokentype_embeddings = torch.nn.Embedding(self.num_tokentypes,
self.hidden_size)
self.tokentype_embeddings = torch.nn.Embedding(self.num_tokentypes, self.hidden_size)
else:
self.tokentype_embeddings = None
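
Not part of the diff: a usage sketch of VocabEmbedding as defined above; the vocabulary and hidden sizes are illustrative values, not taken from this commit:

# Illustrative sketch only, not part of this commit.
import torch

vocab_size, hidden_size = 30522, 768
embed = VocabEmbedding(vocab_size, hidden_size)      # class defined in the file above
token_ids = torch.randint(0, vocab_size, (2, 16))    # [batch, seq_len]
hidden_states = embed(token_ids)                     # F.embedding lookup -> [2, 16, 768]
print(hidden_states.shape)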

View File

@@ -3,12 +3,10 @@ import torch.nn as nn
import torch.nn.functional as F
from loss_func.cross_entropy import vocab_cross_entropy
import colossalai
from colossalai.kernel import LayerNorm
from colossalai.legacy.context import ParallelMode
from colossalai.legacy.core import global_context as gpc
from .embedding import VocabEmbedding
from .linear import Linear
from .pooler import Pooler
@@ -26,7 +24,6 @@ class BertLMHead(nn.Module):
vocab_size,
hidden_size,
):
super(BertLMHead, self).__init__()
self.bias = torch.nn.Parameter(torch.zeros(vocab_size))
@@ -46,7 +43,6 @@ class BertLMHead(nn.Module):
class BertBinaryHead(nn.Module):
def __init__(self, hidden_size):
super().__init__()
self.pooler = Pooler(hidden_size)
@@ -62,7 +58,6 @@ class BertBinaryHead(nn.Module):
class BertDualHead(nn.Module):
def __init__(self, hidden_size, vocab_size, add_binary_head):
super().__init__()
self.lm_head = BertLMHead(vocab_size, hidden_size)
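
Not part of the diff: the constructor signature visible above allows a sketch like the following; the sizes are illustrative, and whether the binary head is attached depends on the calling code, which is not shown here.

# Illustrative sketch only, not part of this commit.
# BertDualHead wraps BertLMHead(vocab_size, hidden_size) and, when add_binary_head
# is set, the pooler-based BertBinaryHead (next-sentence-prediction style head).
head = BertDualHead(hidden_size=768, vocab_size=30522, add_binary_head=True)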

View File

@@ -1,6 +1,8 @@
import torch
import math
import torch
def init_normal(tensor, sigma):
"""Init method based on N(0, sigma)."""
torch.nn.init.normal_(tensor, mean=0.0, std=sigma)
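
Not part of the diff: init_normal above is a thin wrapper over torch.nn.init.normal_; a quick sketch of how it is used:

# Illustrative sketch only, not part of this commit.
import torch

w = torch.empty(768, 768)
init_normal(w, sigma=0.02)      # in-place fill from N(0, 0.02**2), i.e. std = sigma
print(float(w.std()))           # close to 0.02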

View File

@@ -1,8 +1,8 @@
import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
import torch.nn.init as init
from torch.nn import Parameter
class Linear(nn.Module):
@@ -24,11 +24,7 @@ class Linear(nn.Module):
adding bias but instead return it.
"""
def __init__(self,
input_size,
output_size,
bias=True,
skip_bias_add=False):
def __init__(self, input_size, output_size, bias=True, skip_bias_add=False):
super(Linear, self).__init__()
# Keep input parameters
@@ -36,9 +32,12 @@ class Linear(nn.Module):
self.output_size = output_size
self.skip_bias_add = skip_bias_add
self.weight = Parameter(torch.empty(self.output_size,
self.input_size,
))
self.weight = Parameter(
torch.empty(
self.output_size,
self.input_size,
)
)
init.normal_(self.weight)
if bias:
self.bias = Parameter(torch.empty(self.output_size))
@@ -46,7 +45,7 @@ class Linear(nn.Module):
with torch.no_grad():
self.bias.zero_()
else:
self.register_parameter('bias', None)
self.register_parameter("bias", None)
def forward(self, input_):
# Matrix multiply.
@@ -59,5 +58,7 @@ class Linear(nn.Module):
return output
def __repr__(self):
return f'Linear(in_features={self.input_size}, out_features={self.output_size}, ' + \
f'bias={self.bias is not None}, skip_bias_add={self.skip_bias_add})'
return (
f"Linear(in_features={self.input_size}, out_features={self.output_size}, "
+ f"bias={self.bias is not None}, skip_bias_add={self.skip_bias_add})"
)
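
Not part of the diff: a usage sketch of the Linear layer above. With skip_bias_add=True the layer is assumed to return the bias alongside the output instead of adding it, which is how TransformerMLP below unpacks it; the return convention for skip_bias_add=False is not visible in this excerpt.

# Illustrative sketch only, not part of this commit.
import torch

linear = Linear(input_size=768, output_size=3072, skip_bias_add=True)
x = torch.randn(16, 2, 768)        # [seq, batch, hidden]
output, bias = linear(x)           # bias returned separately for downstream fusion
print(output.shape, bias.shape)    # -> torch.Size([16, 2, 3072]) torch.Size([3072])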

View File

@@ -1,10 +1,10 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from .linear import Linear
from colossalai.kernel.jit import bias_gelu_impl
from .linear import Linear
class TransformerMLP(nn.Module):
"""MLP.
@@ -18,19 +18,13 @@ class TransformerMLP(nn.Module):
super(TransformerMLP, self).__init__()
# Project to 4h.
self.dense_h_to_4h = Linear(
hidden_size,
int(hidden_size*mlp_ratio),
skip_bias_add=True)
self.dense_h_to_4h = Linear(hidden_size, int(hidden_size * mlp_ratio), skip_bias_add=True)
self.bias_gelu_fusion = fuse_gelu
self.activation_func = F.gelu
# Project back to h.
self.dense_4h_to_h = Linear(
int(hidden_size*mlp_ratio),
hidden_size,
skip_bias_add=True)
self.dense_4h_to_h = Linear(int(hidden_size * mlp_ratio), hidden_size, skip_bias_add=True)
def forward(self, hidden_states):
# hidden states should be in the shape of [s, b, h]
@@ -39,11 +33,9 @@ class TransformerMLP(nn.Module):
intermediate_parallel, bias_parallel = self.dense_h_to_4h(hidden_states)
if self.bias_gelu_fusion:
intermediate_parallel = \
bias_gelu_impl(intermediate_parallel, bias_parallel)
intermediate_parallel = bias_gelu_impl(intermediate_parallel, bias_parallel)
else:
intermediate_parallel = \
self.activation_func(intermediate_parallel + bias_parallel)
intermediate_parallel = self.activation_func(intermediate_parallel + bias_parallel)
# [s, b, h]
output, output_bias = self.dense_4h_to_h(intermediate_parallel)
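
Not part of the diff: the two branches above are equivalent up to the GeLU approximation used by the fused jit kernel; a sketch of the unfused branch, which needs only torch (the tensor shapes are illustrative):

# Illustrative sketch only, not part of this commit.
import torch
import torch.nn.functional as F

intermediate, bias = torch.randn(16, 2, 3072), torch.randn(3072)
out = F.gelu(intermediate + bias)   # activation_func(intermediate_parallel + bias_parallel)
print(out.shape)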

View File

@@ -1,5 +1,6 @@
import torch
import torch.nn as nn
from .linear import Linear

View File

@@ -6,7 +6,6 @@ from colossalai.legacy.core import global_context as gpc
class PreProcessor(nn.Module):
def __init__(self, sub_seq_length):
super().__init__()
self.sub_seq_length = sub_seq_length
@@ -15,10 +14,9 @@ class PreProcessor(nn.Module):
# Create position ids
seq_length = token_ids.size(1)
local_rank = gpc.get_local_rank(ParallelMode.SEQUENCE)
position_ids = torch.arange(seq_length * local_rank,
seq_length * (local_rank + 1),
dtype=torch.long,
device=token_ids.device)
position_ids = torch.arange(
seq_length * local_rank, seq_length * (local_rank + 1), dtype=torch.long, device=token_ids.device
)
position_ids = position_ids.unsqueeze(0).expand_as(token_ids)
return position_ids
@@ -42,7 +40,7 @@ class PreProcessor(nn.Module):
extended_attention_mask = attention_mask_bss.unsqueeze(1)
# Convert attention mask to binary:
extended_attention_mask = (extended_attention_mask < 0.5)
extended_attention_mask = extended_attention_mask < 0.5
return extended_attention_mask
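
Not part of the diff: what the torch.arange call above produces per sequence-parallel rank, shown without the gpc/ParallelMode machinery. Each rank owns a contiguous sub-sequence, so its position ids start at seq_length * local_rank.

# Illustrative sketch only, not part of this commit.
import torch

seq_length = 4                         # length of the local sub-sequence on each rank
for local_rank in range(2):            # pretend SEQUENCE-parallel world size of 2
    position_ids = torch.arange(
        seq_length * local_rank, seq_length * (local_rank + 1), dtype=torch.long
    )
    print(local_rank, position_ids.tolist())
# 0 [0, 1, 2, 3]
# 1 [4, 5, 6, 7]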