Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-06-28 08:17:57 +00:00)
[NFC] polish colossalai/nn/layer/parallel_2p5d/layers.py code style (#972)
commit 598cde4a0f (parent 632e94abde)
```diff
@@ -189,7 +189,7 @@ class Linear2p5D(ParallelLayer):
     def forward(self, x: Tensor) -> Tensor:
         # input: [m/dq, n/q, k/q]
         # output: [m/dq, n/q, h/q]
-        out_shape = x.shape[:-1] + (self.hidden_size_per_partition, )
+        out_shape = x.shape[:-1] + (self.hidden_size_per_partition,)
 
         output = Matmul_AB_2p5D.apply(
             x,
```
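The only change in this hunk is the tuple spelling: `(h, )` becomes `(h,)`. A minimal standalone sketch, with made-up sizes standing in for `x.shape` and `self.hidden_size_per_partition`, confirms both spellings build the same one-element tuple, so the computed output shape is unchanged:

```python
# Stand-ins for x.shape and self.hidden_size_per_partition (hypothetical sizes).
x_shape = (4, 8, 16)            # e.g. [m/dq, n/q, k/q]
hidden_size_per_partition = 32

# The polish only drops the space before ')': both spellings are the same tuple.
out_shape_old = x_shape[:-1] + (hidden_size_per_partition, )
out_shape_new = x_shape[:-1] + (hidden_size_per_partition,)
assert out_shape_old == out_shape_new == (4, 8, 32)
```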
```diff
@@ -254,7 +254,7 @@ class LayerNorm2p5D(ParallelLayer):
         self.tesseract_dim, _ = get_tesseract_dim_dep_from_env()
 
         # partitioning dimension
         self.partitioned_partition = divide(normalized_shape, self.tesseract_dim)  # *
 
         # create parameters
         factory_kwargs = {'device': get_current_device(), 'dtype': dtype}
```
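For context, the `divide` call here shards the layer norm dimension across the tesseract dimension. A minimal sketch, assuming the usual contract of ColossalAI's `divide` helper (exact integer division that fails loudly on a remainder); the sizes are hypothetical:

```python
# Assumed contract of the divide helper: exact division or a loud failure.
def divide(numerator: int, denominator: int) -> int:
    assert numerator % denominator == 0, \
        f'{numerator} is not divisible by {denominator}'
    return numerator // denominator

normalized_shape = 1024   # hypothetical hidden size
tesseract_dim = 2         # the q in the [b/q, s, h/q] comments
partitioned_partition = divide(normalized_shape, tesseract_dim)  # 512 per rank
```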
```diff
@@ -357,16 +357,16 @@ class LayerNorm2p5D(ParallelLayer):
 
     def forward(self, x: Tensor) -> Tensor:
         with torch.no_grad():
             E_x = torch.sum(x, dim=-1, keepdim=True)  # [b/q, s, 1]
             torch.distributed.all_reduce(E_x, group=gpc.get_group(ParallelMode.PARALLEL_2P5D_ROW))
             E_x /= self.normalized_shape
 
             # Var_x in the block below is the sum of input^2
             Var_x = torch.sum(x * x, dim=-1, keepdim=True)  # [b/q, s, 1]
             torch.distributed.all_reduce(Var_x, group=gpc.get_group(ParallelMode.PARALLEL_2P5D_ROW))
             Var_x /= self.normalized_shape
 
             Var_x = Var_x - E_x * E_x  # variance of x [b/q, s, 1]
             # this time 1/sqrt(Var_x + epsilon)
             Var_x = 1.0 / torch.sqrt(Var_x + self.variance_epsilon)
 
```
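This forward builds the layer norm statistics from shard-local sums: each rank sums over its slice of the hidden dimension, `all_reduce` over `PARALLEL_2P5D_ROW` completes the sums, and the variance comes from the identity Var[x] = E[x^2] - E[x]^2. A single-process sketch with hypothetical sizes (one process holds the whole tensor, so no `all_reduce` is needed) checks that identity against PyTorch's biased variance:

```python
import torch

x = torch.randn(2, 5, 16)       # [batch, seq, hidden]; hypothetical sizes
normalized_shape = x.shape[-1]  # the full hidden dimension
variance_epsilon = 1e-5

# In the 2.5D layer these sums are shard-local and completed by all_reduce;
# here a single process sees the full hidden dimension.
E_x = torch.sum(x, dim=-1, keepdim=True) / normalized_shape        # E[x]
Var_x = torch.sum(x * x, dim=-1, keepdim=True) / normalized_shape  # E[x^2]
Var_x = Var_x - E_x * E_x                                          # Var[x]
inv_std = 1.0 / torch.sqrt(Var_x + variance_epsilon)

# Cross-check against the biased variance layer norm uses.
ref = torch.var(x, dim=-1, keepdim=True, unbiased=False)
assert torch.allclose(Var_x, ref, atol=1e-5)
```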
```diff
@@ -589,7 +589,7 @@ class PatchEmbedding2p5D(ParallelLayer):
 
         output = F.conv2d(input_, weight, bias, stride=self.patch_size)
         if self.flatten:
             output = output.flatten(2).transpose(1, 2)  # BCHW -> BNC
 
         cls_token = all_gather_tensor_2p5d(self.cls_token, -1, ParallelMode.PARALLEL_2P5D_COL)
         pos_embed = all_gather_tensor_2p5d(self.pos_embed, -1, ParallelMode.PARALLEL_2P5D_COL)
```
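In this hunk, `F.conv2d` with `stride=self.patch_size` cuts the image into non-overlapping patches, and `flatten(2).transpose(1, 2)` reorders the feature map from BCHW to the BNC token layout. A standalone sketch with hypothetical shapes, using a plain dense weight as a stand-in for the column-sharded one:

```python
import torch
import torch.nn.functional as F

patch_size = 4
images = torch.randn(2, 3, 16, 16)                   # [B, C, H, W]; hypothetical
weight = torch.randn(32, 3, patch_size, patch_size)  # [embed_dim, in_chans, k, k]
bias = torch.randn(32)

# stride == kernel size => non-overlapping patches: [B, 32, 4, 4]
output = F.conv2d(images, weight, bias, stride=patch_size)
tokens = output.flatten(2).transpose(1, 2)           # BCHW -> BNC: [B, 16, 32]
assert tokens.shape == (2, (16 // patch_size) ** 2, 32)
```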
```diff
@@ -1038,7 +1038,7 @@ class Classifier2p5D(ParallelLayer):
         destination.update(local_state)
 
     def forward(self, input_: Tensor) -> Tensor:
-        out_shape = input_.shape[:-1] + (self.num_classes, )
+        out_shape = input_.shape[:-1] + (self.num_classes,)
 
         return classifier_2p5d(input_, self.weight, self.bias, self.tesseract_dim, out_shape, self.row_rank,
                                self.col_rank, ParallelMode.PARALLEL_2P5D_ROW, ParallelMode.PARALLEL_2P5D_COL,
```
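As in the `Linear2p5D` hunk, the style fix is the tuple spacing in `out_shape`. The shape contract itself: `classifier_2p5d` maps `[..., hidden]` to `[..., num_classes]`, with `out_shape` being the input shape with its last dimension swapped out. A hedged single-device stand-in using a plain linear projection (the real kernel additionally splits the matmul across the tesseract grid):

```python
import torch

num_classes = 10
input_ = torch.randn(2, 7, 64)         # [..., hidden]; hypothetical sizes
weight = torch.randn(num_classes, 64)  # full (unsharded) stand-in
bias = torch.zeros(num_classes)

out_shape = input_.shape[:-1] + (num_classes,)
out = (input_.reshape(-1, 64) @ weight.t() + bias).view(out_shape)
assert out.shape == (2, 7, num_classes)
```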
```diff
@@ -1172,7 +1172,7 @@ class VocabParallelClassifier2p5D(ParallelLayer):
     def forward(self, x: Tensor) -> Tensor:
         # input: [m/dq, n/q, k/q]
         # output: [m/dq, n/q, h/q]
-        out_shape = x.shape[:-1] + (self.hidden_size_per_partition, )
+        out_shape = x.shape[:-1] + (self.hidden_size_per_partition,)
 
         output = Matmul_ABT_2p5D.apply(
             x,
```