From 35f45ffd369c1dffb2ad0e1961eaba4ce8fbb1b9 Mon Sep 17 00:00:00 2001
From: Edenzzzz <wtan45@wisc.edu>
Date: Tue, 6 May 2025 14:14:22 -0500
Subject: [PATCH] fix: clarify code attribution in RingAttention docstring

---
 colossalai/shardformer/layer/attn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 957685c03..71e96c5b0 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -410,7 +410,7 @@ class RingAttention(torch.autograd.Function):
     We also adopt the double ring topology from LoongTrain to fully utilize available
     NICs on each node, by computing attention within a inner ring first and then sending all KVs to the next
     ring at once.
-    Our implementation references
+    Our implementation references code from
     - ring-flash-attention: https://github.com/zhuzilin/ring-flash-attention/tree/main
     - Megatron Context Parallel: https://github.com/NVIDIA/TransformerEngine/pull/726
     References: