Add zloss tests
josejg committed Sep 9, 2024
1 parent 3ad64e2 commit 1526174
Showing 2 changed files with 75 additions and 5 deletions.
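
For reference, the quantity exercised by these tests is the router z-loss from ST-MoE (Zoph et al., 2022), which penalizes large router logits to keep the gating softmax numerically stable. A minimal sketch of the formulation, assuming a (num_tokens, num_experts) logits tensor; the actual batched_router_zloss in megablocks may differ in how it batches and weights the term:

import torch

def router_zloss(logits: torch.Tensor) -> torch.Tensor:
    # Mean squared log-sum-exp of the router logits (the ST-MoE z-loss).
    # logits: (num_tokens, num_experts) raw gating scores.
    lse = torch.logsumexp(logits, dim=-1)  # (num_tokens,)
    return lse.square().mean()  # scaled by moe_zloss_weight during training
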
36 changes: 36 additions & 0 deletions tests/layers/dmoe_test.py
@@ -10,6 +10,7 @@
from megablocks.layers.arguments import Arguments
from megablocks.layers.dmoe import dMoE
from megablocks.layers.moe import MoE, batched_load_balancing_loss, clear_load_balancing_loss
from megablocks.layers.router import batched_router_zloss, clear_router_zloss
from tests.layers.architectures import FFN

# min size: (1, 2, 128, 2, 1)
@@ -50,6 +51,7 @@ def construct_moes(
    moe_capacity_factor: int = 1,
    moe_top_k: int = 1,
    mlp_impl: str = 'sparse',
    moe_zloss_weight: float = 0,
):
    init_method = partial(torch.nn.init.normal_, mean=0.0, std=0.1)
    args = Arguments(
@@ -64,6 +66,7 @@
        mlp_impl=mlp_impl,
        fp16=False,
        bf16=True,
        moe_zloss_weight=moe_zloss_weight,
    )

    mlp = FFN(args)
@@ -142,6 +145,39 @@ def test_dmoe_forward_backward(
    clear_load_balancing_loss()


@pytest.mark.gpu
@pytest.mark.parametrize(('bs', 'sl', 'hs', 'num_experts', 'top_k', 'mlp_impl'), _FORWARD_TESTS)
def test_dmoe_forward_backward_with_zloss(
    bs: int,
    sl: int,
    hs: int,
    num_experts: int,
    top_k: int,
    mlp_impl: str,
):
    x = torch.randn(sl, bs, hs).to(torch.bfloat16).cuda()
    x.requires_grad_(True)

    args, _, _, layer = construct_moes(
        hidden_size=hs,
        ffn_hidden_size=hs * 2,
        moe_num_experts=num_experts,
        moe_top_k=top_k,
        mlp_impl=mlp_impl,
        moe_zloss_weight=1e-3,
    )

    out, _ = layer(x)
    assert out.shape == x.shape
    loss = out.sum() + batched_load_balancing_loss(args) + batched_router_zloss(args)
    loss.backward()
    assert x.grad is not None
    layer.zero_grad(set_to_none=True)
    x.grad = None
    clear_load_balancing_loss()
    clear_router_zloss()
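
The clear_load_balancing_loss / clear_router_zloss calls at the end of each test are needed because megablocks accumulates these auxiliary losses in module-level state on every forward pass. A hypothetical training step built from the same pieces (optimizer setup omitted; out.sum() stands in for a real task loss):

def train_step(layer, x, args, optimizer):
    out, _ = layer(x)
    loss = out.sum() + batched_load_balancing_loss(args) + batched_router_zloss(args)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad(set_to_none=True)
    # The auxiliary losses are accumulated globally each forward pass,
    # so reset them after every step to avoid double counting.
    clear_load_balancing_loss()
    clear_router_zloss()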


@pytest.mark.gpu
@pytest.mark.parametrize(('bs', 'sl', 'hs'), _DENSE_TESTS)
def test_dmoe_forward_vs_baseline(
44 changes: 39 additions & 5 deletions tests/layers/moe_test.py
@@ -8,6 +8,7 @@

from megablocks.layers.arguments import Arguments
from megablocks.layers.moe import MoE, batched_load_balancing_loss, clear_load_balancing_loss
from megablocks.layers.router import batched_router_zloss, clear_router_zloss
from tests.layers.architectures import FFN

_FORWARD_TESTS = (
@@ -33,11 +34,12 @@


def construct_moe(
-    hidden_size,
-    ffn_hidden_size,
-    moe_num_experts=1,
-    moe_capacity_factor=1,
-    moe_top_k=1,
+    hidden_size: int,
+    ffn_hidden_size: int,
+    moe_num_experts: int = 1,
+    moe_capacity_factor: int = 1,
+    moe_top_k: int = 1,
+    moe_zloss_weight: float = 0,
):
    init_method = partial(torch.nn.init.normal_, mean=0.0, std=0.1)
    args = Arguments(
@@ -47,6 +49,7 @@ def construct_moe(
        moe_capacity_factor=moe_capacity_factor,
        moe_top_k=moe_top_k,
        init_method=init_method,
        moe_zloss_weight=moe_zloss_weight,
    )

    mlp = FFN(args)
@@ -109,6 +112,37 @@ def test_moe_forward_backward(
    clear_load_balancing_loss()


@pytest.mark.gpu
@pytest.mark.parametrize(('bs', 'sl', 'hs', 'num_experts', 'top_k'), _FORWARD_TESTS)
def test_moe_forward_backward_with_zloss(
    bs: int,
    sl: int,
    hs: int,
    num_experts: int,
    top_k: int,
):
    x = torch.randn(sl, bs, hs).half().cuda()
    x.requires_grad_(True)

    args, _, layer = construct_moe(
        hidden_size=hs,
        ffn_hidden_size=hs * 2,
        moe_num_experts=num_experts,
        moe_top_k=top_k,
        moe_zloss_weight=1e-3,
    )

    out, _ = layer(x)
    assert out.shape == x.shape

    loss = out.sum() + batched_load_balancing_loss(args) + batched_router_zloss(args)
    loss.backward()
    assert x.grad is not None
    layer.zero_grad(set_to_none=True)
    x.grad = None
    clear_load_balancing_loss()
    clear_router_zloss()


@pytest.mark.gpu
@pytest.mark.parametrize(('bs', 'sl', 'hs'), _DENSE_TESTS)
def test_moe_forward_vs_dense(bs: int, sl: int, hs: int):
