tenstorrent · jerrysky3 · Sep 19, 2024 · Sep 20, 2024 · Sep 20, 2024 · Sep 20, 2024
diff --git a/tests/lowering/embedding/test_embedding.py b/tests/lowering/embedding/test_embedding.py
@@ -70,3 +70,35 @@ def test_embedding_tile_layout(device, batch_size, sentence_size, vocabulary_siz
     assert [node.target for node in nodes].count(ttnn.embedding) == 1
     # Check inference result
     assert torch.allclose(result_before, result_after)
+
+
+@pytest.mark.parametrize(
+    "batch, sentence_size, vocabulary_size, hidden_embedding_dim",
+    [
+        (1, 384, 160, 1024),
+        (8, 384, 256, 512),
+        pytest.param(8, 384, 512, 1024, marks=pytest.mark.xfail(reason="Not support vocabulary size > 256 (#248)")),
+    ],
+)
+def test_embedding_backward_tile_layout(device, batch, sentence_size, vocabulary_size, hidden_embedding_dim):
+    m = EmbeddingTileLayoutModule()
+    input = torch.randint(0, vocabulary_size, (batch, sentence_size), dtype=torch.int64)
+    weights = torch.rand((vocabulary_size, hidden_embedding_dim), dtype=torch.bfloat16)
+    grad_data = torch.rand((batch, sentence_size, hidden_embedding_dim))
+
+    weights_before = weights.clone().detach().requires_grad_(True)
+    m.forward(input, weights_before).backward(gradient=grad_data)
+
+    option = torch_ttnn.TorchTtnnOption(device=device, gen_graphviz=True)
+    # The compilation is lazy, so the forward call below is what actually triggers it
+    m = torch.compile(m, backend=torch_ttnn.backend, options=option)
+    weights_after = weights.clone().detach().requires_grad_(True)
+    m.forward(input, weights_after).backward(gradient=grad_data)
+
+    # Check the graph has been rewritten
+    nodes = list(option._out_fx_graphs[-1].nodes)
+    assert [node.target for node in nodes].count(ttnn.embedding_bw) == 1
+    # Check the weight gradients from the eager and compiled runs match
+    assert weights_before.grad.shape == weights_after.grad.shape
+    # Multiple float multiplications need a higher tolerance
+    assert torch.allclose(weights_before.grad, weights_after.grad, rtol=0.1)
diff --git a/torch_ttnn/passes/lowering/add_data_move_pass.py b/torch_ttnn/passes/lowering/add_data_move_pass.py
@@ -153,9 +153,11 @@ def is_tt_compute(node) -> bool:
         + TTNN_NORM_OPS
         + [
             ttnn.embedding,
+            ttnn.embedding_bw,
             ttnn.ones,
             ttnn.tril,
             ttnn.arange,
+            ttnn.zeros,
             ttnn.zeros_like,
             ttnn.mean,
             ttnn.global_avg_pool2d,

diff --git a/torch_ttnn/passes/lowering/to_tt_pass.py b/torch_ttnn/passes/lowering/to_tt_pass.py
@@ -8,6 +8,7 @@
     TtnnDramMemoryConfig,
     TtnnRowMajorLayout,
 )
+import math
 import numpy as np
 from typing import Tuple
 import torch_ttnn.metrics as metrics
@@ -623,6 +624,26 @@ def rewrite_node(node):
                     input = g.call_function(ttnn.to_layout, args=(input, TtnnRowMajorLayout()))
                 return g.call_function(ttnn.pad, args=(input, full_pad, value))
 
+            if node.target == torch.ops.aten.embedding_dense_backward.default:
+                grad_output, indices, num_weights, padding_idx, scale_grad_by_freq = args
+                # TODO(#248): padding_idx and scale_grad_by_freq are not supported yet
+                if padding_idx != -1 or scale_grad_by_freq:
+                    return None
+                if num_weights > 256:
+                    return None
+                # Change indices to row-major layout to support non-tile-aligned shape
+                indices = g.call_function(ttnn.to_layout, args=(indices, TtnnRowMajorLayout()))
+                grad_shape = grad_output.meta["val"].size()
+                embedding_dim = grad_shape[-1]
+                # Reconstruct a fake weight tensor solely for vocabulary size (dim 0)
+                weights = g.call_function(ttnn.zeros, args=((num_weights, 32),), kwargs={"device": TtnnDevice()})
+                # Pack grad_output into (1, 1, x, embedding_dim), where x is the product of the leading dims
+                new_grad_shape = (1, 1, math.prod(grad_shape[:-1]), embedding_dim)
+                grad_output = g.call_function(ttnn.reshape, args=(grad_output, new_grad_shape))
+
+                result = g.call_function(ttnn.embedding_bw, args=(indices, weights, grad_output))
+                return g.call_function(ttnn.reshape, args=(result, node.meta["val"].size()))
+
         with g.inserting_before(node):
             new_node = rewrite_node(node)
             if new_node is not None: