pytorch · wonjoolee95 · May 3, 2024 · May 2, 2024 · May 2, 2024 · alanwaketan
diff --git a/test/test_pallas.py b/test/test_pallas.py
@@ -590,18 +590,19 @@ def test_paged_attention_wrapper_with_dynamo(self):
 
     def paged_attention_wrapper(q, k, v, seq_lens, page_indices,
                                 pages_per_compute_block):
-      return paged_attention(
-          q_xla,
-          k_pages_xla,
-          v_pages_xla,
-          seq_lens_xla,
-          page_indices_xla,
-          pages_per_compute_block=block_size // page_size,
+      return torch.ops.xla.paged_attention(
+          q,
+          k,
+          v,
+          seq_lens,
+          page_indices,
+          pages_per_compute_block=pages_per_compute_block,
       )
 
     compiled_paged_attention = torch.compile(
         paged_attention_wrapper, backend="openxla")
-    output = paged_attention_wrapper(
+
+    output = compiled_paged_attention(
         q_xla,
         k_pages_xla,
         v_pages_xla,

diff --git a/torch_xla/experimental/custom_kernel.py b/torch_xla/experimental/custom_kernel.py
@@ -490,11 +490,8 @@ def non_xla_attetion(q, k, v, attention_type):
         f'XLA {attention_type} attention should only be applied to tensors on XLA device'
     )
 
-  # perform a regular attention if input tensors are not on XLA device.
-  attn_weight = q @ k.transpose(-2, -1)
-  attn_weight = torch.nn.functional.softmax(attn_weight, dim=-1)
-  attn_output = attn_weight @ v
-  return attn_output
+  # Return an uninitialized tensor with the original shape of q.
+  return torch.empty_like(q)
 
 
 XLA_LIB.define(
@@ -537,4 +534,4 @@ def paged_attention_non_xla(q: torch.Tensor, k_pages: torch.Tensor,
                             v_pages: torch.Tensor, lengths: torch.Tensor,
                             page_indices: torch.Tensor,
                             pages_per_compute_block: int):
-  return non_xla_attetion(q, k, v, "paged")
+  return non_xla_attetion(q, k_pages, v_pages, "paged")