Added moe parallel example (#140)

HELSON
2022-01-17 15:34:04 +08:00
committed by GitHub
parent f68eddfb3d
commit 1ff5be36c2
4 changed files with 168 additions and 3 deletions


@@ -14,6 +14,7 @@ from colossalai.utils import get_current_device
 class VanillaSelfAttention(nn.Module):
     """Standard ViT self attention.
     """
+
     def __init__(self,
                  d_model: int,
                  n_heads: int,
@@ -57,6 +58,7 @@ class VanillaSelfAttention(nn.Module):
 class VanillaFFN(nn.Module):
     """FFN composed of two linear layers, also called MLP.
     """
+
     def __init__(self,
                  d_model: int,
                  d_ff: int,
@@ -72,8 +74,7 @@ class VanillaFFN(nn.Module):
         drop1 = nn.Dropout(drop_rate) if dropout1 is None else dropout1
         drop2 = nn.Dropout(drop_rate) if dropout2 is None else dropout2
-        self.ffn = nn.Sequential(
-            dense1, act, drop1,dense2, drop2)
+        self.ffn = nn.Sequential(dense1, act, drop1, dense2, drop2)

     def forward(self, x):
         return self.ffn(x)
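
For reference, a minimal standalone sketch of the dense1 -> act -> drop1 -> dense2 -> drop2 pattern this hunk reformats; the GELU activation and the 0.1 dropout rate are assumptions, since the diff does not show them:

# Standalone sketch of the FFN pattern built in VanillaFFN above.
# The GELU activation and 0.1 dropout are assumed values, not taken from this diff.
import torch
from torch import nn

d_model, d_ff = 768, 3072
ffn = nn.Sequential(
    nn.Linear(d_model, d_ff),    # dense1: expand hidden size
    nn.GELU(),                   # act (assumed)
    nn.Dropout(0.1),             # drop1 (assumed rate)
    nn.Linear(d_ff, d_model),    # dense2: project back
    nn.Dropout(0.1),             # drop2 (assumed rate)
)
x = torch.randn(2, 196, d_model)  # (batch, tokens, hidden)
assert ffn(x).shape == x.shape    # FFN preserves the input shape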
@@ -91,7 +92,7 @@ class Widenet(nn.Module):
                  d_model: int = 768,
                  num_heads: int = 12,
                  d_kv: int = 64,
-                 d_ff: int = 3072,
+                 d_ff: int = 4096,
                  attention_drop: float = 0.,
                  drop_rate: float = 0.1,
                  drop_path: float = 0.):
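
For context, a hypothetical construction sketch using only the keyword defaults visible in this hunk; the import path and any constructor arguments not shown in the diff (some of which may be required) are assumptions, not taken from this commit:

# Hypothetical usage sketch. The module path is an assumption; Widenet is the
# class defined in the file this diff modifies.
from model_zoo.moe.models import Widenet  # assumed import path

model = Widenet(
    d_model=768,          # defaults as listed in the signature above
    num_heads=12,
    d_kv=64,
    d_ff=4096,            # raised from 3072 by this commit
    attention_drop=0.,
    drop_rate=0.1,
    drop_path=0.,
    # arguments not shown in this hunk (e.g. expert/patch settings) are omitted here
)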