From 598cde4a0f2a07682a351371d0a55fd83923eac7 Mon Sep 17 00:00:00 2001 From: Zirui Zhu Date: Mon, 16 May 2022 11:27:33 +0800 Subject: [PATCH] [NFC] polish colossalai/nn/layer/parallel_2p5d/layers.py code style (#972) --- colossalai/nn/layer/parallel_2p5d/layers.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/colossalai/nn/layer/parallel_2p5d/layers.py b/colossalai/nn/layer/parallel_2p5d/layers.py index d89150642..f26efcc61 100644 --- a/colossalai/nn/layer/parallel_2p5d/layers.py +++ b/colossalai/nn/layer/parallel_2p5d/layers.py @@ -189,7 +189,7 @@ class Linear2p5D(ParallelLayer): def forward(self, x: Tensor) -> Tensor: # input: [m/dq, n/q, k/q] # output: [m/dq, n/q, h/q] - out_shape = x.shape[:-1] + (self.hidden_size_per_partition, ) + out_shape = x.shape[:-1] + (self.hidden_size_per_partition,) output = Matmul_AB_2p5D.apply( x, @@ -254,7 +254,7 @@ class LayerNorm2p5D(ParallelLayer): self.tesseract_dim, _ = get_tesseract_dim_dep_from_env() # partitioning dimension - self.partitioned_partition = divide(normalized_shape, self.tesseract_dim) # * + self.partitioned_partition = divide(normalized_shape, self.tesseract_dim) # * # create parameters factory_kwargs = {'device': get_current_device(), 'dtype': dtype} @@ -357,16 +357,16 @@ class LayerNorm2p5D(ParallelLayer): def forward(self, x: Tensor) -> Tensor: with torch.no_grad(): - E_x = torch.sum(x, dim=-1, keepdim=True) # [b/q, s, 1] + E_x = torch.sum(x, dim=-1, keepdim=True) # [b/q, s, 1] torch.distributed.all_reduce(E_x, group=gpc.get_group(ParallelMode.PARALLEL_2P5D_ROW)) E_x /= self.normalized_shape # Var_x in the block below is the sum of input^2 - Var_x = torch.sum(x * x, dim=-1, keepdim=True) # [b/q, s, 1] + Var_x = torch.sum(x * x, dim=-1, keepdim=True) # [b/q, s, 1] torch.distributed.all_reduce(Var_x, group=gpc.get_group(ParallelMode.PARALLEL_2P5D_ROW)) Var_x /= self.normalized_shape - Var_x = Var_x - E_x * E_x # variance of x [b/q, s, 1] + Var_x = Var_x - E_x * E_x # variance of x [b/q, s, 1] # this time 1/sqrt(Var_x + epsilon) Var_x = 1.0 / torch.sqrt(Var_x + self.variance_epsilon) @@ -589,7 +589,7 @@ class PatchEmbedding2p5D(ParallelLayer): output = F.conv2d(input_, weight, bias, stride=self.patch_size) if self.flatten: - output = output.flatten(2).transpose(1, 2) # BCHW -> BNC + output = output.flatten(2).transpose(1, 2) # BCHW -> BNC cls_token = all_gather_tensor_2p5d(self.cls_token, -1, ParallelMode.PARALLEL_2P5D_COL) pos_embed = all_gather_tensor_2p5d(self.pos_embed, -1, ParallelMode.PARALLEL_2P5D_COL) @@ -1038,7 +1038,7 @@ class Classifier2p5D(ParallelLayer): destination.update(local_state) def forward(self, input_: Tensor) -> Tensor: - out_shape = input_.shape[:-1] + (self.num_classes, ) + out_shape = input_.shape[:-1] + (self.num_classes,) return classifier_2p5d(input_, self.weight, self.bias, self.tesseract_dim, out_shape, self.row_rank, self.col_rank, ParallelMode.PARALLEL_2P5D_ROW, ParallelMode.PARALLEL_2P5D_COL, @@ -1172,7 +1172,7 @@ class VocabParallelClassifier2p5D(ParallelLayer): def forward(self, x: Tensor) -> Tensor: # input: [m/dq, n/q, k/q] # output: [m/dq, n/q, h/q] - out_shape = x.shape[:-1] + (self.hidden_size_per_partition, ) + out_shape = x.shape[:-1] + (self.hidden_size_per_partition,) output = Matmul_ABT_2p5D.apply( x,