Triton #6798 (Merged)

Commits (67):

6dccf0a  Update infra_triggers.tf (ManfeiBai)
9828123  Skeleton trition support (bhavya01)
99bf48d  Merge branch 'master' into triton (bhavya01)
b89e558  Fix bugs (bhavya01)
64189bd  Fix custom call invocation (bhavya01)
0c208ef  Refactor to include gpu custom call and create triton dir (bhavya01)
b553ba7  Lint fixes (bhavya01)
c5129e6  python lint fix (bhavya01)
48e7127  Updated base image for CI (bhavya01)
e04fc97  Update github workflow gcr image (bhavya01)
37bf127  Merge branch 'master' into custom (bhavya01)
6061895  Remove xrt build and test file (bhavya01)
f59ddbf  Add temporary test to run triton kernel (bhavya01)
158aed4  Fix tests (bhavya01)
87b92c5  Update payload for xla gpu custom call (bhavya01)
847ccc5  Update gpu runner (bhavya01)
eca6d52  Merge branch 'master' into triton (bhavya01)
2348ca3  Extract payload from triton kernel programatically (bhavya01)
110c8c6  Merge branch 'master' into triton (bhavya01)
a226150  Lint fixes (bhavya01)
4c1f4f5  Only build triton files for GPU (bhavya01)
431f822  build pytorch for ampere gpus (bhavya01)
4bade16  c++ lint fix (bhavya01)
1c5b47d  Python lint fix (bhavya01)
3138a92  Fix torch cuda arch list (bhavya01)
3f00cfd  Use a bigger machine for CI build (bhavya01)
e729cfb  Add triton test to run_tests.sh (bhavya01)
8e304c0  Update triton env variable (bhavya01)
27bdc3a  Set up a separate CI for triton tests (bhavya01)
9a3ef84  Fix github workflow to add _triton.yml (bhavya01)
ade444d  Rebuild torch xla for triton tests (bhavya01)
cb0bb85  Create a separate CI tab for triton tests (bhavya01)
015b1ad  Separate build and test phase for triton (bhavya01)
a18028a  Fix flags for docker run container (bhavya01)
993ee92  Update triton.yml to output docker image (bhavya01)
a87b782  Add a python binding to register custom calls and remove jax files (bhavya01)
bf05d1b  Fix lint (bhavya01)
4582fe8  Merge main (bhavya01)
9680167  Merge master (bhavya01)
a7b94c6  Merge master after updating (bhavya01)
e14636a  Update CI to use cuda plugin (bhavya01)
256d819  Install jaxlib while setting up triton tests (bhavya01)
c616e64  Install triton package while running triton tests (bhavya01)
60b8d18  Experimental: Build pytorch with cuda (bhavya01)
2bde624  Revert build pytorch with CUDA (bhavya01)
e6c4e0a  Merge branch 'master' into triton (bhavya01)
14ee545  Remove ansible path for triton CI (bhavya01)
25acb26  Style fixes (bhavya01)
6b0ac18  [Experimental] test new CI (bhavya01)
4d97150  [Experimental] Set XLA_CUDA=0 for cuda arch in ansible (bhavya01)
e079049  [Experimental] Update CI to build pytorch cuda with ansible (bhavya01)
d9c89b6  Update CI (bhavya01)
7a6c809  Fix CI workflow file (bhavya01)
6b1954d  Fix CI workflow (bhavya01)
21797a6  Fix the wheels installed for tests requiring torch cuda (bhavya01)
e6e89d3  Add compute_capability=8.6 for xla cuda plugin (bhavya01)
ac45fe1  update TORCH_CUDA_ARCH_LIST (bhavya01)
f828fbb  Experimental build torch and torch_xla cuda wheels (bhavya01)
ac56c00  Merge branch 'master' into triton (bhavya01)
c3b8653  Update build_and_test.yml (bhavya01)
a1168c6  Update dlpack test to only use one device (bhavya01)
39551a2  Remove compute capability 8.6 from cuda plugin (bhavya01)
35e0869  Remove triton.sh (bhavya01)
f95d898  Default empty torch_cuda_arch_list in ansible config (bhavya01)
291104d  Merge branch 'master' into triton (bhavya01)
f5c9b1a  Revert CI changes (bhavya01)
5b23969  Revert CI changes pt2 (bhavya01)
New file (+36 lines): torch_xla/csrc/ops/gpu_custom_call.cpp
#include "torch_xla/csrc/ops/gpu_custom_call.h"

#include "torch_xla/csrc/lowering_context.h"
#include "torch_xla/csrc/ops/xla_ops.h"
#include "torch_xla/csrc/xla_lower_util.h"

namespace torch_xla {

GpuCustomCall::GpuCustomCall(torch::lazy::OpList inputs,
                             xla::Shape output_shape,
                             const std::string& payload)
    : XlaNode(xla_gpu_custom_call, inputs, std::move(output_shape),
              /*num_outputs=*/1, torch::lazy::MHash(payload)),
      payload_(payload) {}

torch::lazy::NodePtr GpuCustomCall::Clone(torch::lazy::OpList operands) const {
  return torch::lazy::MakeNode<GpuCustomCall>(operands, xla_shape(), payload_);
}

XlaOpVector GpuCustomCall::Lower(LoweringContext* loctx) const {
  std::vector<xla::XlaOp> inputs;
  inputs.reserve(operands().size());
  for (auto& operand : operands()) {
    inputs.push_back(loctx->GetOutputOp(operand));
  }
  xla::XlaOp output = BuildGpuCustomCall(inputs, xla_shape(), payload_);
  return ReturnOp(output, loctx);
}

std::string GpuCustomCall::ToString() const {
  std::stringstream ss;
  ss << XlaNode::ToString() << ", " << payload_;
  return ss.str();
}

}  // namespace torch_xla
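BuildGpuCustomCall is declared in xla_lower_util.h and its implementation is not part of this excerpt. As a rough sketch of what that lowering step amounts to, assuming XLA's CustomCall builder API and a hypothetical target name "triton_kernel_call", it could look roughly like this:

// Sketch only: the real BuildGpuCustomCall lives in xla_lower_util.cpp and is
// not shown in this diff; the "triton_kernel_call" target name is an
// assumption, not taken from this excerpt.
#include "xla/client/xla_builder.h"

xla::XlaOp BuildGpuCustomCallSketch(const std::vector<xla::XlaOp>& inputs,
                                    const xla::Shape& output_shape,
                                    const std::string& payload) {
  // Emit an HLO CustomCall whose opaque string carries the serialized Triton
  // kernel payload; a runtime-side handler deserializes it and launches the
  // kernel on the compute stream.
  return xla::CustomCall(inputs.front().builder(), "triton_kernel_call",
                         inputs, output_shape, /*opaque=*/payload);
}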
New file (+26 lines): torch_xla/csrc/ops/gpu_custom_call.h
#ifndef XLA_TORCH_XLA_CSRC_OPS_GPU_CUSTOM_CALL_H_
#define XLA_TORCH_XLA_CSRC_OPS_GPU_CUSTOM_CALL_H_

#include "torch_xla/csrc/ir.h"

namespace torch_xla {
// TODO: Merge GPU and TPU custom call.

class GpuCustomCall : public XlaNode {
 public:
  // Make a GPU custom call with payload, e.g., Triton.
  GpuCustomCall(torch::lazy::OpList inputs, xla::Shape output_shape,
                const std::string& payload);

  torch::lazy::NodePtr Clone(torch::lazy::OpList operands) const override;

  XlaOpVector Lower(LoweringContext* loctx) const override;

  std::string ToString() const override;

 private:
  std::string payload_;
};

}  // namespace torch_xla

#endif  // XLA_TORCH_XLA_CSRC_OPS_GPU_CUSTOM_CALL_H_
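For context, a caller constructs this node with torch::lazy::MakeNode, the same way Clone does in the .cpp file above. A minimal, hypothetical helper (the actual frontend entry point added by this PR is not shown in this excerpt):

// Hypothetical helper, for illustration only: builds the IR node from traced
// inputs and a serialized kernel payload, mirroring GpuCustomCall::Clone.
torch::lazy::NodePtr MakeGpuCustomCall(torch::lazy::OpList inputs,
                                       xla::Shape output_shape,
                                       const std::string& payload) {
  // num_outputs is fixed to 1 in the constructor, and the payload is hashed
  // into the node identity so different kernels never share a cached
  // computation.
  return torch::lazy::MakeNode<GpuCustomCall>(inputs, std::move(output_shape),
                                              payload);
}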
New Bazel BUILD file (+93 lines) for the Triton directory added in this PR:
load("//bazel:rules_def.bzl", "cc_proto_library")

cc_library(
    name = "cuda_vendor",
    hdrs = [
        "gpu_vendor.h",
    ],
    deps = [
        "@local_config_cuda//cuda:cuda_headers",
        "@local_config_cuda//cuda:cudnn_header",
    ],
)

proto_library(
    name = "triton_proto",
    srcs = ["triton.proto"],
)

cc_proto_library(
    name = "triton_cc_proto",
    deps = [":triton_proto"],
)

cc_library(
    name = "cuda_gpu_kernel_helpers",
    srcs = [
        "gpu_kernel_helpers.cpp",
    ],
    hdrs = [
        "gpu_kernel_helpers.h",
    ],
    copts = [
        "-fexceptions",
    ],
    features = ["-use_header_modules"],
    deps = [
        ":cuda_vendor",
        "@xla//xla/tsl/cuda:cupti",
        "@xla//xla/tsl/cuda:cusolver",
        "@xla//xla/tsl/cuda:cusparse",
        "@com_google_absl//absl/base:core_headers",
        "@com_google_absl//absl/log:check",
        "@com_google_absl//absl/memory",
        "@com_google_absl//absl/status",
        "@com_google_absl//absl/status:statusor",
        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/strings:str_format",
        "@local_config_cuda//cuda:cublas_headers",
        "@local_config_cuda//cuda:cuda_headers",
    ],
)

cc_library(
    name = "triton_utils",
    srcs = ["triton_utils.cpp"],
    hdrs = ["triton_utils.h"],
    visibility = ["//visibility:public"],
    deps = [
        ":cuda_gpu_kernel_helpers",
        ":cuda_vendor",
        ":triton_cc_proto",
        "@com_google_absl//absl/status",
        "@com_google_absl//absl/status:statusor",
        "@com_google_absl//absl/strings",
        "@zlib",
    ],
)

cc_library(
    name = "triton_kernels",
    srcs = ["triton_kernels.cpp"],
    hdrs = ["triton_kernels.h"],
    deps = [
        ":cuda_gpu_kernel_helpers",
        ":cuda_vendor",
        ":triton_utils",
        ":triton_cc_proto",
        "@xla//xla/service:custom_call_target_registry",
        "@xla//xla/service:custom_call_status",
        "@xla//xla/stream_executor/gpu:asm_compiler",
        "@xla//xla/tsl/cuda:cudart",
        "@tsl//tsl/platform:env",
        "@com_google_absl//absl/base:core_headers",
        "@com_google_absl//absl/cleanup",
        "@com_google_absl//absl/container:flat_hash_map",
        "@com_google_absl//absl/log",
        "@com_google_absl//absl/log:check",
        "@com_google_absl//absl/status",
        "@com_google_absl//absl/status:statusor",
        "@com_google_absl//absl/strings:str_format",
        "@com_google_absl//absl/synchronization",
    ],
)
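The triton_kernels target depends on XLA's custom_call_target_registry and custom_call_status, which suggests the runtime side registers a CUDA custom-call handler that receives the opaque payload emitted at lowering time. A minimal sketch of that registration pattern, assuming the status-based GPU custom-call ABI and the same hypothetical "triton_kernel_call" target name as above:

// Sketch of XLA's custom-call registration pattern; the handler body and the
// "triton_kernel_call" name are assumptions, not taken from this diff.
#include <cuda.h>

#include "xla/service/custom_call_status.h"
#include "xla/service/custom_call_target_registry.h"

// Status-based GPU custom-call ABI: stream, buffer pointers (inputs followed
// by outputs), and the opaque payload string emitted at lowering time.
void TritonKernelCall(CUstream stream, void** buffers, const char* opaque,
                      size_t opaque_len, XlaCustomCallStatus* status) {
  // A real handler would deserialize `opaque` (e.g. the triton.proto message),
  // compile or look up the cached CUBIN, launch it on `stream`, and report
  // failures via XlaCustomCallStatusSetFailure().
}

XLA_REGISTER_CUSTOM_CALL_TARGET_WITH_SYM("triton_kernel_call",
                                         &TritonKernelCall, "CUDA");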
Review comment: I expect we'll have more than one test kernel going forward. What do you think about putting all kernel payloads for Triton (and Pallas) in a separate YAML file?
cc @jiawenliu64