diff --git a/megablocks/layers/moe.py b/megablocks/layers/moe.py index 94af93a8..a23ada23 100644 --- a/megablocks/layers/moe.py +++ b/megablocks/layers/moe.py @@ -145,7 +145,7 @@ def load_balancing_loss(self, tokens_per_expert, expert_scores): assert num_experts == self.num_experts scale = self.num_experts / (tokens * self.args.moe_top_k) return scale * torch.dot( - tokens_per_expert.half(), + tokens_per_expert.to(expert_scores.dtype), expert_scores.mean(dim=0)) def indices_and_bins(self, top_expert):