[shardformer] fix linear 1d row and support uneven splits for fused qkv linear (#6084)

* [tp] hotfix linear row

* [tp] support uneven split for fused linear

* [tp] support sp for fused linear

* [tp] fix gpt2 mlp policy

* [tp] fix gather for fused linear and add fused linear row
Hongxin Liu
2024-10-10 14:34:45 +08:00
committed by GitHub
parent f4daf04270
commit 646b3c5a90
10 changed files with 399 additions and 157 deletions
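For context on the "uneven splits" in the title: a fused QKV projection stores the query, key, and value weights concatenated along the output dimension, so naively chunking that dimension across tensor-parallel ranks hands each rank a mix of partial Q, K, and V slices, and it only works when every segment is divisible by the world size. Below is a minimal sketch of the idea, not the shardformer implementation; the helper name and the remainder policy (first ranks absorb the extra rows) are illustrative assumptions:

```python
import torch

def split_fused_qkv_uneven(
    fused_weight: torch.Tensor,  # (q_out + k_out + v_out, in_features)
    split_sizes: list[int],      # e.g. [q_out, k_out, v_out]
    tp_size: int,
    rank: int,
) -> torch.Tensor:
    """Hypothetical helper: take one rank's shard of a fused QKV weight."""
    shards = []
    for segment in torch.split(fused_weight, split_sizes, dim=0):
        base, rem = divmod(segment.shape[0], tp_size)
        # Uneven case: the first `rem` ranks take one extra row each,
        # so segment sizes need not be divisible by tp_size.
        sizes = [base + 1 if r < rem else base for r in range(tp_size)]
        shards.append(torch.split(segment, sizes, dim=0)[rank])
    # Re-fuse this rank's Q/K/V slices into a single local weight.
    return torch.cat(shards, dim=0)
```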


@@ -428,11 +428,8 @@ class Linear1D_Row(ParallelModule):
             handle.wait()
             output = torch.cat(output_parallel_list, dim=-1)
         else:
-            if self.seq_parallel_mode is None:
-                output_parallel = linear_with_async_comm(input_, self.weight, None, self.process_group, False)
-                output = reduce_forward(output_parallel, self.process_group, fp8_communication=self.fp8_communication)
-            elif self.seq_parallel_mode == "split_gather":
-                output_parallel = linear_with_async_comm(input_, self.weight, None, self.process_group, False)
+            if self.seq_parallel_mode == "split_gather":
+                output_parallel = F.linear(input_, self.weight)
                 output = reducescatter_forward_gather_backward(
                     output_parallel, self.process_group, self.seq_parallel_dim, fp8_communication=self.fp8_communication
                 )
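In the `split_gather` branch above, the patch swaps `linear_with_async_comm` (a matmul overlapped with an async all-reduce) for a plain `F.linear` followed by an explicit reduce-scatter: with this flavor of sequence parallelism, partial outputs are reduce-scattered along the sequence dimension in the forward pass and the gradient is all-gathered in the backward pass, so there is no all-reduce to overlap. A minimal sketch of that communication pattern with plain `torch.distributed` calls (hypothetical class name; the fp8 path is omitted, and the sequence dimension is assumed divisible by the world size):

```python
import torch
import torch.distributed as dist

class _ReduceScatterForwardGatherBackward(torch.autograd.Function):
    """Reduce-scatter partial sums in forward, all-gather gradients in backward."""

    @staticmethod
    def forward(ctx, input_, group, dim):
        ctx.group, ctx.dim = group, dim
        world_size = dist.get_world_size(group)
        # Each rank keeps 1/world_size of the sequence dim, fully reduced.
        chunks = [c.contiguous() for c in input_.chunk(world_size, dim=dim)]
        output = torch.empty_like(chunks[0])
        dist.reduce_scatter(output, chunks, group=group)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        world_size = dist.get_world_size(ctx.group)
        # The gradient of a reduce-scatter is an all-gather along the same dim.
        parts = [torch.empty_like(grad_output) for _ in range(world_size)]
        dist.all_gather(parts, grad_output.contiguous(), group=ctx.group)
        return torch.cat(parts, dim=ctx.dim), None, None
```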
@@ -445,8 +442,8 @@
                     ring=True,
                 )
             else:
-                output_parallel = linear_with_async_comm(input_, self.weight, None, self.process_group, False)
-                output = reduce_forward(output_parallel, self.process_group)
+                output_parallel = F.linear(input_, self.weight)
+                output = reduce_forward(output_parallel, self.process_group, fp8_communication=self.fp8_communication)
         if not self.skip_bias_add:
             if self.bias is not None:
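The second hunk folds the `seq_parallel_mode is None` case into the final `else` branch and, besides switching to `F.linear`, forwards `fp8_communication` to `reduce_forward`, which the old `else` dropped; that appears to be the "hotfix linear row" item from the commit message. For reference, `reduce_forward` in a row-parallel linear is an all-reduce in the forward pass and an identity in the backward pass, since each rank's input shard contributes additively to the output. A minimal sketch (hypothetical class name, fp8 omitted):

```python
import torch
import torch.distributed as dist

class _ReduceForward(torch.autograd.Function):
    """All-reduce partial row-parallel outputs in forward; pass gradients through."""

    @staticmethod
    def forward(ctx, input_, group):
        dist.all_reduce(input_, group=group)
        return input_

    @staticmethod
    def backward(ctx, grad_output):
        # The all-reduce needs no backward communication: each rank's partial
        # output contributed additively, so its gradient is just grad_output.
        return grad_output, None
```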