From 35f45ffd369c1dffb2ad0e1961eaba4ce8fbb1b9 Mon Sep 17 00:00:00 2001 From: Edenzzzz Date: Tue, 6 May 2025 14:14:22 -0500 Subject: [PATCH] fix --- colossalai/shardformer/layer/attn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py index 957685c03..71e96c5b0 100644 --- a/colossalai/shardformer/layer/attn.py +++ b/colossalai/shardformer/layer/attn.py @@ -410,7 +410,7 @@ class RingAttention(torch.autograd.Function): We also adopt the double ring topology from LoongTrain to fully utilize available NICs on each node, by computing attention within a inner ring first and then sending all KVs to the next ring at once. - Our implementation references + Our implementation references code from - ring-flash-attention: https://github.com/zhuzilin/ring-flash-attention/tree/main - Megatron Context Parallel: https://github.com/NVIDIA/TransformerEngine/pull/726 References: