[legacy] move communication and nn to legacy and refactor logger (#4671)

* [legacy] move communication to legacy (#4640)

* [legacy] refactor logger and clean up legacy codes (#4654)

* [legacy] make logger independent of gpc

* [legacy] make optim independent of registry

* [legacy] move test engine to legacy

* [legacy] move nn to legacy (#4656)

* [legacy] move nn to legacy

* [checkpointio] fix save hf config

* [test] remove useless rpc pp test

* [legacy] fix nn init

* [example] skip tutorial hybrid parallel example

* [devops] test doc check

* [devops] test doc check

Author: Hongxin Liu
Committed: 2023-09-11 16:24:28 +08:00 (committed by GitHub)
Parent: 536397cc95
Commit: 554aa9592e
170 changed files with 781 additions and 758 deletions

View File

@@ -0,0 +1,4 @@
from ._operation import RingAV, RingQK
from .layers import TransformerSelfAttentionRing
__all__ = ['TransformerSelfAttentionRing', 'RingAV', 'RingQK']

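The package `__init__` above keeps a single import surface for the relocated module. A minimal usage sketch of the intended import path (the path is inferred from the absolute imports in the files below; the snippet itself is illustrative, not taken from the diff):

# Hypothetical usage: import the re-exported symbols from the legacy package.
from colossalai.legacy.nn.layer.parallel_sequence import (
    RingAV,
    RingQK,
    TransformerSelfAttentionRing,
)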
View File

@@ -0,0 +1,151 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import torch
from torch import distributed as dist
from torch.cuda.amp import custom_bwd, custom_fwd
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.communication import ring_forward
from colossalai.legacy.nn.layer.parallel_sequence._utils import _calc_current_device_range, _calc_incoming_device_range
from colossalai.utils import get_current_device
class RingQK(torch.autograd.Function):
"""
Calculate QK in a ring-exchange style
"""
@staticmethod
@custom_fwd
def forward(ctx, sub_q, sub_k, batch_size, num_attention_heads, sub_seq_length):
# save tensor for backward
ctx.save_for_backward(sub_q, sub_k)
ctx.sub_seq_length = sub_seq_length
# create local segment of attention score
attention_score = torch.empty(batch_size * num_attention_heads,
sub_seq_length,
sub_seq_length * gpc.get_world_size(ParallelMode.SEQUENCE),
dtype=sub_q.dtype,
device=get_current_device())
# compute local QK^T
part_a = torch.matmul(sub_q, sub_k.transpose(2, 1))
local_rank = gpc.get_local_rank(ParallelMode.SEQUENCE)
local_world_size = gpc.get_world_size(ParallelMode.SEQUENCE)
start_idx = local_rank * sub_seq_length
end_idx = (local_rank + 1) * sub_seq_length
attention_score[:, :, start_idx:end_idx] = part_a
# compute QK^T in ring-all-reduce style
for i in range(local_world_size - 1):
sub_k = ring_forward(sub_k, ParallelMode.SEQUENCE)
start_idx, end_idx = _calc_incoming_device_range(i, local_rank, local_world_size, sub_seq_length)
part_a = torch.matmul(sub_q, sub_k.transpose(2, 1))
attention_score[:, :, start_idx:end_idx] = part_a
return attention_score
@staticmethod
@custom_bwd
def backward(ctx, grad_output):
sub_q, sub_k = ctx.saved_tensors
local_rank = gpc.get_local_rank(ParallelMode.SEQUENCE)
local_world_size = gpc.get_world_size(ParallelMode.SEQUENCE)
# calculate gradient of sub_k
grad_k = torch.matmul(grad_output.transpose(2, 1), sub_q)
dist.all_reduce(grad_k, group=gpc.get_group(ParallelMode.SEQUENCE))
grad_k = grad_k[:, local_rank * ctx.sub_seq_length:(local_rank + 1) * ctx.sub_seq_length]
grad_k /= local_world_size
# calculate gradient for sub_q
grad_q = torch.zeros_like(
sub_q,
dtype=sub_q.dtype,
device=get_current_device(),
)
# compute with local sub_k
start_idx, end_idx = _calc_current_device_range(local_rank, ctx.sub_seq_length)
grad_q += torch.matmul(grad_output[:, :, start_idx:end_idx], sub_k)
# compute grad_q with the remaining key segments in ring-all-reduce style
for i in range(local_world_size - 1):
sub_k = ring_forward(sub_k, ParallelMode.SEQUENCE)
start_idx, end_idx = _calc_incoming_device_range(i, local_rank, local_world_size, ctx.sub_seq_length)
grad_q += torch.matmul(grad_output[:, :, start_idx:end_idx], sub_k)
grad_q /= local_world_size
return grad_q, grad_k, None, None, None
class RingAV(torch.autograd.Function):
"""
Calculate AV in a ring-exchange style
"""
@staticmethod
@custom_fwd
def forward(ctx, attention_score, sub_v, batch_size, num_attention_heads, attention_head_size, sub_seq_length):
local_rank = gpc.get_local_rank(ParallelMode.SEQUENCE)
local_world_size = gpc.get_world_size(ParallelMode.SEQUENCE)
local_start_idx, local_end_idx = _calc_current_device_range(local_rank, sub_seq_length)
sub_attention_result = torch.zeros(batch_size * num_attention_heads,
sub_seq_length,
attention_head_size,
device=get_current_device(),
dtype=attention_score.dtype)
# save tensors for backward
ctx.save_for_backward(attention_score, sub_v)
ctx.sub_seq_length = sub_seq_length
# compute local AV
part_av = torch.matmul(attention_score[:, :, local_start_idx:local_end_idx], sub_v)
sub_attention_result += part_av
# compute AV in ring-all-reduce style
for i in range(local_world_size - 1):
sub_v = ring_forward(sub_v, ParallelMode.SEQUENCE)
start_idx, end_idx = _calc_incoming_device_range(i, local_rank, local_world_size, sub_seq_length)
# compute AV for the incoming value segment
part_av = torch.matmul(attention_score[:, :, start_idx:end_idx], sub_v)
sub_attention_result += part_av
return sub_attention_result
@staticmethod
@custom_bwd
def backward(ctx, grad_output):
local_rank = gpc.get_local_rank(ParallelMode.SEQUENCE)
local_world_size = gpc.get_world_size(ParallelMode.SEQUENCE)
local_start_idx, local_end_idx = _calc_current_device_range(local_rank, ctx.sub_seq_length)
attention_scores, sub_v = ctx.saved_tensors
# calculate gradient of v
grad_v = torch.matmul(attention_scores.transpose(2, 1), grad_output)
dist.all_reduce(grad_v, group=gpc.get_group(ParallelMode.SEQUENCE))
grad_v = grad_v[:, local_start_idx:local_end_idx]
grad_v /= local_world_size
# calculate gradient for attention score
grad_attention_score = torch.zeros_like(attention_scores, dtype=grad_output.dtype, device=get_current_device())
# compute with local sub_v
grad_attention_score[:, :, local_start_idx:local_end_idx] += torch.matmul(grad_output, sub_v.transpose(2, 1))
# compute gradient of attention score in ring-all-reduce style
for i in range(local_world_size - 1):
sub_v = ring_forward(sub_v, ParallelMode.SEQUENCE)
start_idx, end_idx = _calc_incoming_device_range(i, local_rank, local_world_size, ctx.sub_seq_length)
# compute gradient of attention score for the incoming value segment
grad_attention_score[:, :, start_idx:end_idx] += torch.matmul(grad_output, sub_v.transpose(2, 1))
return grad_attention_score, grad_v, None, None, None, None

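A brief aside before the next file: `RingQK` assembles each rank's rows of the full score matrix S = Q K^T column-block by column-block, first filling the columns for the locally held key segment and then, on each of the remaining world_size - 1 steps, the columns for the key segment received via `ring_forward`; `RingAV` accumulates the context S V over value segments in the same fashion. Below is a single-process sketch of that segment-wise assembly (no distributed setup; names and shapes are illustrative, not part of the ColossalAI API):

import torch

# Illustrative sizes: (batch * heads, full sequence, head size), 2 "ranks".
batch_heads, seq_len, head_size, world_size = 4, 8, 16, 2
sub_seq = seq_len // world_size
rank = 0                                               # pretend we are rank 0
q = torch.randn(batch_heads, seq_len, head_size)
k = torch.randn(batch_heads, seq_len, head_size)

q_local = q[:, rank * sub_seq:(rank + 1) * sub_seq]    # this rank's query segment

# Reference: this rank's rows of the full score matrix, computed in one shot.
full_rows = torch.matmul(q_local, k.transpose(2, 1))   # [batch_heads, sub_seq, seq_len]

# Ring-style assembly: fill the columns one key segment at a time; in the real
# RingQK the segments arrive via ring_forward instead of local slicing.
ring_rows = torch.empty_like(full_rows)
for src in range(world_size):
    start, end = src * sub_seq, (src + 1) * sub_seq
    ring_rows[:, :, start:end] = torch.matmul(q_local, k[:, start:end].transpose(2, 1))

assert torch.allclose(full_rows, ring_rows)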
View File

@@ -0,0 +1,15 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
def _calc_incoming_device_range(i, rank, world_size, sub_seq_length):
device_of_incoming_k = (rank - i - 1) % world_size
start_idx = sub_seq_length * device_of_incoming_k
end_idx = sub_seq_length * (device_of_incoming_k + 1)
return start_idx, end_idx
def _calc_current_device_range(rank, sub_seq_length):
start_idx = sub_seq_length * rank
end_idx = sub_seq_length * (rank + 1)
return start_idx, end_idx

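For concreteness, a small worked check of the two helpers above: with world_size = 4 and sub_seq_length = 128, rank 2 owns indices [256, 384), and on ring step i = 0 it receives the segment of rank (2 - 0 - 1) % 4 = 1, i.e. [128, 256).

# Worked check of the index helpers (values follow the arithmetic above).
from colossalai.legacy.nn.layer.parallel_sequence._utils import (
    _calc_current_device_range,
    _calc_incoming_device_range,
)

assert _calc_current_device_range(rank=2, sub_seq_length=128) == (256, 384)
assert _calc_incoming_device_range(i=0, rank=2, world_size=4, sub_seq_length=128) == (128, 256)
assert _calc_incoming_device_range(i=1, rank=2, world_size=4, sub_seq_length=128) == (0, 128)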
View File

@@ -0,0 +1,237 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
import colossalai
from colossalai.context import seed
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.kernel import FusedScaleMaskSoftmax
from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType
from colossalai.legacy.nn.layer.parallel_sequence._operation import RingAV, RingQK
from colossalai.legacy.registry import LAYERS
@LAYERS.register_module
class TransformerSelfAttentionRing(nn.Module):
"""Parallel self-attention layer abstract class.
Self-attention layer takes input with size [b, s, h]
and returns output of the same size.
Args:
hidden_size (int): hidden size.
num_attention_heads (int): number of attention heads.
attention_dropout (float): dropout probability for attention layer.
attention_mask_func (:class:`typing.Callable`): Mask function to be applied.
layer_number (int): 1-based index of this layer; used as the scaling coefficient for query-key layer scaling.
"""
def __init__(self,
hidden_size,
num_attention_heads,
attention_dropout,
attention_mask_func,
layer_number,
apply_query_key_layer_scaling: bool = False,
convert_fp16_to_fp32_in_softmax: bool = False,
attn_mask_type=AttnMaskType.padding,
masked_softmax_fusion=True,
fp16=False,
bf16=False):
super().__init__()
self.convert_fp16_to_fp32_in_softmax = convert_fp16_to_fp32_in_softmax
self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
self.attention_mask_func = attention_mask_func
self.layer_number = layer_number
self.hidden_size = hidden_size
self.num_attention_heads = num_attention_heads
self.attn_mask_type = attn_mask_type
assert self.layer_number > 0
self.attention_dropout = attention_dropout
if self.apply_query_key_layer_scaling:
self.convert_fp16_to_fp32_in_softmax = True
assert self.hidden_size % self.num_attention_heads == 0, \
'hidden size is not divisible by the number of attention heads'
self.hidden_size_per_attention_head = self.hidden_size // num_attention_heads
self.world_size = gpc.get_world_size(ParallelMode.SEQUENCE)
# Strided linear layer.
self.query_key_value = _Linear(
hidden_size,
3 * self.hidden_size,
)
self.coeff = None
self.norm_factor = math.sqrt(self.hidden_size)
if self.apply_query_key_layer_scaling:
self.coeff = layer_number
self.norm_factor *= self.coeff
self.scale_mask_softmax = FusedScaleMaskSoftmax(fp16, bf16, self.attn_mask_type, masked_softmax_fusion,
self.attention_mask_func, self.convert_fp16_to_fp32_in_softmax,
self.coeff)
self.attention_dropout = nn.Dropout(attention_dropout)
# Output.
self.dense = _Linear(hidden_size, hidden_size, bias=True, skip_bias_add=True)
def forward(self, hidden_states, attention_mask):
# hidden_states: [sub_seq_len, batch_size, hidden_size]
# attention_mask: [batch_size, 1, sub_seq_len, seq_len]
sub_seq_length, batch_size, hidden_size = hidden_states.size()
# =====================
# Query, Key, and Value
# =====================
# Attention heads shape change:
# [sub_seq_len, batch_size, hidden_size] --> [sub_seq_len, batch_size, (3 * head_size * num_heads)]
mixed_x_layer = self.query_key_value(hidden_states)
# [sub_seq_len, batch_size, 3 * hidden_size] --> [sub_seq_len, batch_size, num_heads, 3 * head_size]
new_tensor_shape = mixed_x_layer.size()[:-1] + (self.num_attention_heads,
3 * self.hidden_size_per_attention_head)
mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
# split into query, key and value
last_dim = mixed_x_layer.dim() - 1
last_dim_value = mixed_x_layer.size(-1)
assert last_dim_value % 3 == 0, 'the last dimension is not a multiple of 3, ' \
'cannot be divided into query, key and value'
partition_size = last_dim_value // 3
(query_layer, key_layer, value_layer) = torch.split(mixed_x_layer, partition_size, dim=last_dim)
# attention scores: [batch_size, num_heads, sub_seq_len, seq_len]
output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0),
key_layer.size(0) * self.world_size)
# [sub_seq_len, batch_size, num_heads, head_size] -> [sub_seq_len, batch_size * num_heads, head_size]
query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1)
# [sub_seq_len, batch_size, num_heads, head_size] -> [sub_seq_len, batch_size * num_heads, head_size]
key_layer = key_layer.view(key_layer.size(0), output_size[0] * output_size[1], -1)
# attention_scores: [batch_size * num_heads, sub_seq_len, seq_len]
attention_scores = RingQK.apply(
query_layer.transpose(0, 1).contiguous(), # [batch_size * num_heads, sub_seq_len, head_size]
key_layer.transpose(0, 1).contiguous(), # [batch_size * num_heads, sub_seq_len, head_size],
batch_size,
self.num_attention_heads,
sub_seq_length)
attention_scores /= self.norm_factor
# change view to [batch_size, num_heads, sub_seq_len, seq_len]
attention_scores = attention_scores.view(*output_size)
# apply attention mask and softmax; shape stays [batch_size, num_heads, sub_seq_len, seq_len]
attention_probs = self.scale_mask_softmax(attention_scores, attention_mask)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
with seed(ParallelMode.TENSOR):
attention_probs = self.attention_dropout(attention_probs)
# context layer shape: [batch_size, num_heads, sub_seq_len, head_size]
output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3))
# change view [sub_seq_len, batch_size * num_heads, head_size]
value_layer = value_layer.contiguous().view(value_layer.size(0), output_size[0] * output_size[1], -1)
# change view to [batch_size * num_heads, sub_seq_len, seq_len]
attention_probs = attention_probs.view(
attention_probs.size(0) * attention_probs.size(1), attention_probs.size(2), attention_probs.size(3))
# matmul: [batch_size * num_heads, sub_seq_len, head_size]
context_layer = RingAV.apply(attention_probs,
value_layer.transpose(0, 1).contiguous(), batch_size, self.num_attention_heads,
self.hidden_size_per_attention_head, sub_seq_length)
# change view [batch_size, num_heads, sub_seq_len, head_size]
context_layer = context_layer.view(*output_size)
# [batch_size, num_heads, sub_seq_len, head_size] -> [sub_seq_len, batch_size, num_heads, head_size]
context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
# [sub_seq_len, batch_size, num_heads, head_size] -> [sub_seq_len, batch_size, hidden_size]
new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_attention_head *
self.num_attention_heads,)
context_layer = context_layer.view(*new_context_layer_shape)
output, bias = self.dense(context_layer)
return output, bias
def __repr__(self):
return f'TransformerSelfAttentionRing(apply_query_key_layer_scaling={self.apply_query_key_layer_scaling}, ' \
f'layer_number={self.layer_number}, hidden_size={self.hidden_size}, attention_dropout={self.attention_dropout}, ' \
f'attn_mask_type={self.attn_mask_type}, num_attention_heads={self.num_attention_heads}, ' \
f'hidden_size_per_attention_head={self.hidden_size_per_attention_head}, coeff={self.coeff}, norm_factor={self.norm_factor}, ' \
f'convert_fp16_to_fp32_in_softmax={self.convert_fp16_to_fp32_in_softmax})'
class _Linear(nn.Module):
"""Linear layer with column parallelism.
The linear layer is defined as Y = XA + b. A is parallelized along
its second dimension as A = [A_1, ..., A_p].
Arguments:
input_size: first dimension of matrix A.
output_size: second dimension of matrix A.
bias: If true, add bias. Note that the bias is always initialized to zero.
skip_bias_add: This was added to enable performance optimizations where the bias
can be fused with other elementwise operations: we skip adding the bias here
and return it together with the output instead.
"""
def __init__(self, input_size, output_size, bias=True, skip_bias_add=False):
super(_Linear, self).__init__()
# Keep input parameters
self.input_size = input_size
self.output_size = output_size
self.skip_bias_add = skip_bias_add
self.weight = Parameter(torch.empty(
self.output_size,
self.input_size,
))
nn.init.xavier_normal_(self.weight)
if bias:
self.bias = Parameter(torch.empty(self.output_size))
# Always initialize bias to zero.
with torch.no_grad():
self.bias.zero_()
else:
self.register_parameter('bias', None)
def forward(self, input_):
# Matrix multiply.
bias = self.bias if not self.skip_bias_add else None
output = F.linear(input_, self.weight, bias)
if self.skip_bias_add:
return output, self.bias
else:
return output
def __repr__(self):
return f'Linear(in_features={self.input_size}, out_features={self.output_size}, ' + \
f'bias={self.bias is not None}, skip_bias_add={self.skip_bias_add})'
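
A closing note on the `skip_bias_add` path used by `self.dense` above: with `skip_bias_add=True`, `_Linear` returns the bias un-added so the caller can fuse the add with a later elementwise op (e.g. dropout plus residual in the surrounding transformer block). A minimal, self-contained sketch of the equivalence in plain PyTorch (assumed shapes; not the ColossalAI API):

import torch
import torch.nn.functional as F

seq_len, batch_size, hidden = 4, 2, 8
weight = torch.randn(hidden, hidden)
bias = torch.randn(hidden)
x = torch.randn(seq_len, batch_size, hidden)

# What _Linear returns with skip_bias_add=True: the bias is left out of the
# matmul and handed back to the caller.
out_no_bias = F.linear(x, weight, None)

# Adding the bias later (e.g. fused with dropout/residual) matches the
# ordinary linear layer up to floating-point rounding.
assert torch.allclose(out_no_bias + bias, F.linear(x, weight, bias), atol=1e-6)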