fanshiqing · littsk · Jul 18, 2024
diff --git a/grouped_gemm/ops.py b/grouped_gemm/ops.py
@@ -69,6 +69,8 @@ def forward(ctx,
       nvtx.range_push("permute_topK forward")
     # Empty input check
     if not input_act.numel():
+      if ENABLE_NVTX:
+        nvtx.range_pop()
       return input_act, None
 
     # For top1 case, view the indices as 2D tensor to unify the shape for topk>=2 cases.
@@ -133,6 +135,8 @@ def backward(ctx, permuted_act_grad, _):
       nvtx.range_push("permute_topK backward")
     # Empty input check
     if not permuted_act_grad.numel():
+      if ENABLE_NVTX:
+        nvtx.range_pop()
       return permuted_act_grad, None, None, None
 
     if not permuted_act_grad.is_contiguous():
@@ -170,6 +174,8 @@ def forward(ctx,
     # Empty input check
     if not input_act.numel():
       ctx.probs = probs
+      if ENABLE_NVTX:
+        nvtx.range_pop()
       return input_act
 
     # Device check
@@ -229,6 +235,8 @@ def backward(ctx, unpermuted_act_grad):
       nvtx.range_push("unpermute_topK backward")
     # Empty input check
     if not unpermuted_act_grad.numel():
+      if ENABLE_NVTX:
+        nvtx.range_pop()
       return unpermuted_act_grad, None, ctx.probs
 
     if not unpermuted_act_grad.is_contiguous():
@@ -255,4 +263,4 @@ def permute(input_act, indices, num_out_tokens=None, max_token_num=0):
   return PermuteMoE_topK.apply(input_act, indices, num_out_tokens, max_token_num)
 
 def unpermute(input_act, row_id_map, probs=None):
-  return UnpermuteMoE_topK.apply(input_act, row_id_map, probs)
+  return UnpermuteMoE_topK.apply(input_act, row_id_map, probs)