diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 957685c03..71e96c5b0 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -410,7 +410,7 @@ class RingAttention(torch.autograd.Function):
         We also adopt the double ring topology from LoongTrain to fully utilize available NICs on each node,
         by computing attention within a inner ring first and then sending all KVs to the next ring at once.
-        Our implementation references
+        Our implementation references code from
         - ring-flash-attention: https://github.com/zhuzilin/ring-flash-attention/tree/main
         - Megatron Context Parallel: https://github.com/NVIDIA/TransformerEngine/pull/726

         References:
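For context, the double ring topology mentioned in the docstring splits the sequence-parallel ranks into small inner rings (attention is computed there first, with step-by-step KV exchange) and inter-ring groups (which then send all KVs to the next ring at once). The sketch below is not ColossalAI's implementation; it only illustrates one plausible way to derive the two kinds of rank groups, with `world_size` and `inner_ring_size` as illustrative parameters.

```python
# Minimal sketch of double-ring rank grouping (illustrative only, not the
# RingAttention code touched by this diff).

def double_ring_groups(world_size: int, inner_ring_size: int):
    """Return (inner_rings, inter_rings) as lists of rank lists."""
    assert world_size % inner_ring_size == 0, "world_size must be divisible by inner_ring_size"
    num_rings = world_size // inner_ring_size

    # Inner rings: blocks of consecutive ranks (ideally on one node) that
    # exchange KV chunks peer-to-peer while computing attention.
    inner_rings = [
        list(range(r * inner_ring_size, (r + 1) * inner_ring_size))
        for r in range(num_rings)
    ]
    # Inter-ring groups: ranks holding the same position in each inner ring;
    # these transfer whole KV blocks to the next ring in one shot.
    inter_rings = [
        list(range(i, world_size, inner_ring_size))
        for i in range(inner_ring_size)
    ]
    return inner_rings, inter_rings


if __name__ == "__main__":
    inner, inter = double_ring_groups(world_size=8, inner_ring_size=4)
    print("inner rings:", inner)   # [[0, 1, 2, 3], [4, 5, 6, 7]]
    print("inter rings:", inter)   # [[0, 4], [1, 5], [2, 6], [3, 7]]
```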