From 8119e43593ae1f9e08c66de357e9a97af48185b8 Mon Sep 17 00:00:00 2001
From: Saaketh
Date: Wed, 4 Sep 2024 22:05:43 -0400
Subject: [PATCH 1/2] change

---
 megablocks/layers/glu.py |  4 ++--
 megablocks/layers/mlp.py | 12 ++++++------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/megablocks/layers/glu.py b/megablocks/layers/glu.py
index e510723..1d9c82b 100644
--- a/megablocks/layers/glu.py
+++ b/megablocks/layers/glu.py
@@ -67,7 +67,7 @@ class MemoryOptimizedGroupedGLU(torch.autograd.Function):
     """GroupedMLP with manually scheduled memory reuse."""
 
     @staticmethod
-    @torch.cuda.amp.custom_fwd
+    @torch.amp.custom_fwd(device_type='cuda')
     def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
         # Cast inputs using ctx dtype from AMP
         if ctx._fwd_used_autocast:
@@ -102,7 +102,7 @@ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
         return dsd_out
 
     @staticmethod
-    @torch.cuda.amp.custom_bwd
+    @torch.amp.custom_bwd(device_type='cuda')
     def backward(ctx, ddsd_out):
         if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
             raise ValueError('Expected all MLP inputs to need grad.')
diff --git a/megablocks/layers/mlp.py b/megablocks/layers/mlp.py
index e8f2d7b..0959c44 100644
--- a/megablocks/layers/mlp.py
+++ b/megablocks/layers/mlp.py
@@ -18,13 +18,13 @@ class ScaleGradient(torch.autograd.Function):
 
     @staticmethod
-    @torch.cuda.amp.custom_fwd
+    @torch.amp.custom_fwd(device_type='cuda')
     def forward(ctx: Any, x: torch.Tensor, scale: float):
         ctx.scale = scale
         return x
 
     @staticmethod
-    @torch.cuda.amp.custom_bwd
+    @torch.amp.custom_bwd(device_type='cuda')
     def backward(ctx: torch.Tensor, grad: torch.Tensor):
         return grad * ctx.scale, None
 
 
@@ -188,7 +188,7 @@ class MemoryOptimizedMLP(torch.autograd.Function):
     """Sparse MLP with manually scheduled memory reuse."""
 
     @staticmethod
-    @torch.cuda.amp.custom_fwd
+    @torch.amp.custom_fwd(device_type='cuda')
     def forward(ctx, x, w1, w2, topo, activation_fn):
         # Cast inputs using ctx dtype from AMP
         if ctx._fwd_used_autocast:
@@ -230,7 +230,7 @@ def forward(ctx, x, w1, w2, topo, activation_fn):
         return dsd_out
 
     @staticmethod
-    @torch.cuda.amp.custom_bwd
+    @torch.amp.custom_bwd(device_type='cuda')
     def backward(ctx, ddsd_out):
         if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
             raise ValueError('Expected all MLP inputs to need grad.')
@@ -398,7 +398,7 @@ class MemoryOptimizedGroupedMLP(torch.autograd.Function):
     """GroupedMLP with manually scheduled memory reuse."""
 
     @staticmethod
-    @torch.cuda.amp.custom_fwd
+    @torch.amp.custom_fwd(device_type='cuda')
     def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
         # Cast inputs using ctx dtype from AMP
         if ctx._fwd_used_autocast:
@@ -431,7 +431,7 @@ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
         return dsd_out
 
     @staticmethod
-    @torch.cuda.amp.custom_bwd
+    @torch.amp.custom_bwd(device_type='cuda')
     def backward(ctx: Any, ddsd_out: torch.Tensor):
         if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
             raise ValueError('Expected all MLP inputs to need grad.')

From 7a8d2e89032e41c54f7729c337a7714f23d22c8f Mon Sep 17 00:00:00 2001
From: Saaketh
Date: Wed, 4 Sep 2024 22:13:27 -0400
Subject: [PATCH 2/2] change

---
 megablocks/layers/glu.py |  4 ++--
 megablocks/layers/mlp.py | 12 ++++++------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/megablocks/layers/glu.py b/megablocks/layers/glu.py
index 1d9c82b..cbe0c91 100644
--- a/megablocks/layers/glu.py
+++ b/megablocks/layers/glu.py
@@ -67,7 +67,7 @@ class MemoryOptimizedGroupedGLU(torch.autograd.Function):
     """GroupedMLP with manually scheduled memory reuse."""
 
     @staticmethod
-    @torch.amp.custom_fwd(device_type='cuda')
+    @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
     def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
         # Cast inputs using ctx dtype from AMP
         if ctx._fwd_used_autocast:
@@ -102,7 +102,7 @@ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
         return dsd_out
 
     @staticmethod
-    @torch.amp.custom_bwd(device_type='cuda')
+    @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
     def backward(ctx, ddsd_out):
         if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
             raise ValueError('Expected all MLP inputs to need grad.')
diff --git a/megablocks/layers/mlp.py b/megablocks/layers/mlp.py
index 0959c44..6e6f4d8 100644
--- a/megablocks/layers/mlp.py
+++ b/megablocks/layers/mlp.py
@@ -18,13 +18,13 @@ class ScaleGradient(torch.autograd.Function):
 
     @staticmethod
-    @torch.amp.custom_fwd(device_type='cuda')
+    @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
     def forward(ctx: Any, x: torch.Tensor, scale: float):
         ctx.scale = scale
         return x
 
     @staticmethod
-    @torch.amp.custom_bwd(device_type='cuda')
+    @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
     def backward(ctx: torch.Tensor, grad: torch.Tensor):
         return grad * ctx.scale, None
 
 
@@ -188,7 +188,7 @@ class MemoryOptimizedMLP(torch.autograd.Function):
     """Sparse MLP with manually scheduled memory reuse."""
 
     @staticmethod
-    @torch.amp.custom_fwd(device_type='cuda')
+    @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
     def forward(ctx, x, w1, w2, topo, activation_fn):
         # Cast inputs using ctx dtype from AMP
         if ctx._fwd_used_autocast:
@@ -230,7 +230,7 @@ def forward(ctx, x, w1, w2, topo, activation_fn):
         return dsd_out
 
     @staticmethod
-    @torch.amp.custom_bwd(device_type='cuda')
+    @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
     def backward(ctx, ddsd_out):
         if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
             raise ValueError('Expected all MLP inputs to need grad.')
@@ -398,7 +398,7 @@ class MemoryOptimizedGroupedMLP(torch.autograd.Function):
     """GroupedMLP with manually scheduled memory reuse."""
 
     @staticmethod
-    @torch.amp.custom_fwd(device_type='cuda')
+    @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
     def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
         # Cast inputs using ctx dtype from AMP
         if ctx._fwd_used_autocast:
@@ -431,7 +431,7 @@ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
         return dsd_out
 
     @staticmethod
-    @torch.amp.custom_bwd(device_type='cuda')
+    @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
     def backward(ctx: Any, ddsd_out: torch.Tensor):
         if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
             raise ValueError('Expected all MLP inputs to need grad.')
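For reference, a minimal usage sketch (not part of either patch) of the decorator spelling this series migrates to, applied to the same ScaleGradient pattern both patches touch in megablocks/layers/mlp.py. It assumes PyTorch >= 2.4, where torch.amp.custom_fwd/custom_bwd accept a device_type argument (torch.amp.autocast_mode, used in PATCH 2, is the module those functions live in); the driver code under the CUDA guard is illustrative only.

# Minimal sketch, assuming PyTorch >= 2.4 (torch.amp.custom_fwd / custom_bwd
# with device_type). The class mirrors ScaleGradient from
# megablocks/layers/mlp.py; the driver below is illustrative only.
import torch


class ScaleGradient(torch.autograd.Function):
    """Identity in forward; scales the incoming gradient in backward."""

    @staticmethod
    @torch.amp.custom_fwd(device_type='cuda')
    def forward(ctx, x, scale):
        # custom_fwd records the surrounding autocast state on ctx
        # (ctx._fwd_used_autocast); with no cast_inputs, inputs pass through
        # unchanged.
        ctx.scale = scale
        return x

    @staticmethod
    @torch.amp.custom_bwd(device_type='cuda')
    def backward(ctx, grad):
        # custom_bwd replays the autocast state captured in forward.
        return grad * ctx.scale, None


if torch.cuda.is_available():
    x = torch.randn(4, 8, device='cuda', requires_grad=True)
    with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
        out = ScaleGradient.apply(x, 0.5)
    out.sum().backward()
    print(x.grad.abs().mean())  # gradients arrive scaled by 0.5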