From 28e3825b1bb1e3728aba88ffc7ba47429cd432e4 Mon Sep 17 00:00:00 2001
From: Mouliraj Elamurugan
Date: Thu, 26 Sep 2024 11:48:15 +0530
Subject: [PATCH] #10033: Add forward support for gcd and lcm (#10241)

* #10033: Add forward support for gcd

* #10033: Add forward support for lcm

* #10033: Update gcd and lcm
---
 docs/source/ttnn/ttnn/api.rst                 |  2 +
 .../operations/backward/utility_funcs.py      | 13 +++++
 .../operations/test_binary_composite.py       | 48 ++++++++++++++++++-
 .../eltwise/binary/binary_composite.hpp       | 18 ++++++-
 .../eltwise/binary/binary_pybind.hpp          | 16 +++++++
 .../binary/device/binary_composite_op.cpp     | 30 +++++++++++-
 ttnn/ttnn/operations/binary.py                | 18 +++++++
 7 files changed, 141 insertions(+), 4 deletions(-)

diff --git a/docs/source/ttnn/ttnn/api.rst b/docs/source/ttnn/ttnn/api.rst
index 8fa742f6d6f..9e0d10a889b 100644
--- a/docs/source/ttnn/ttnn/api.rst
+++ b/docs/source/ttnn/ttnn/api.rst
@@ -302,6 +302,8 @@ Pointwise Binary
    ttnn.floor_div
    ttnn.remainder
    ttnn.fmod
+   ttnn.gcd
+   ttnn.lcm
    ttnn.logical_and_
    ttnn.logical_or_
    ttnn.logical_xor_
diff --git a/tests/ttnn/unit_tests/operations/backward/utility_funcs.py b/tests/ttnn/unit_tests/operations/backward/utility_funcs.py
index b45fd6c6d8a..4835f501418 100644
--- a/tests/ttnn/unit_tests/operations/backward/utility_funcs.py
+++ b/tests/ttnn/unit_tests/operations/backward/utility_funcs.py
@@ -29,6 +29,19 @@ def data_gen_with_range(input_shapes, low, high, device, required_grad=False, is
     return pt_tensor, tt_tensor


+def data_gen_with_range_int(input_shapes, low, high, device, required_grad=False, is_row_major=False):
+    assert high > low, "Incorrect range provided"
+    torch.manual_seed(213919)
+    pt_tensor = torch.randint(low, high, input_shapes, dtype=torch.int32, requires_grad=required_grad)
+
+    if is_row_major:
+        tt_tensor = ttnn.Tensor(pt_tensor, ttnn.float32).to(ttnn.ROW_MAJOR_LAYOUT).to(device)
+    else:
+        tt_tensor = ttnn.Tensor(pt_tensor, ttnn.float32).to(ttnn.TILE_LAYOUT).to(device)
+
+    return pt_tensor, tt_tensor
+
+
 def data_gen_with_val(input_shapes, device, required_grad=False, val=1, is_row_major=False):
     pt_tensor = (torch.ones(input_shapes, requires_grad=required_grad) * val).bfloat16()
     if is_row_major:
diff --git a/tests/ttnn/unit_tests/operations/test_binary_composite.py b/tests/ttnn/unit_tests/operations/test_binary_composite.py
index 5c64744fd2f..f5460e85cf9 100644
--- a/tests/ttnn/unit_tests/operations/test_binary_composite.py
+++ b/tests/ttnn/unit_tests/operations/test_binary_composite.py
@@ -6,8 +6,13 @@
 import pytest
 import random
 import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc, compare_equal
-from models.utility_functions import is_grayskull, skip_for_grayskull, skip_for_wormhole_b0
+from tests.ttnn.unit_tests.operations.backward.utility_funcs import (
+    data_gen_with_range,
+    data_gen_with_range_int,
+    compare_pcc,
+    compare_equal,
+)
+from models.utility_functions import is_grayskull, skip_for_grayskull


 @pytest.mark.parametrize(
@@ -845,4 +850,43 @@ def test_nei_ttnn(input_shapes, scalar, device):
     golden_tensor = golden_function(in_data, scalar)

     comp_pass = compare_equal([input_tensor], [golden_tensor])
+
+
+@pytest.mark.parametrize(
+    "input_shapes",
+    (
+        (torch.Size([1, 1, 32, 32])),
+        (torch.Size([1, 1, 320, 384])),
+        (torch.Size([1, 3, 320, 384])),
+    ),
+)
+@skip_for_grayskull("#ToDo: GS implementation needs to be done for remainder")
+def test_binary_gcd_ttnn(input_shapes, device):
+    in_data1, input_tensor1 = data_gen_with_range_int(input_shapes, -1024, 1024, device)
+    in_data2, input_tensor2 = data_gen_with_range_int(input_shapes, -1024, 1024, device)
+    output_tensor = ttnn.gcd(input_tensor1, input_tensor2)
+    golden_function = ttnn.get_golden_function(ttnn.gcd)
+    golden_tensor = golden_function(in_data1, in_data2)
+
+    comp_pass = compare_pcc([output_tensor], [golden_tensor])
+    assert comp_pass
+
+
+@pytest.mark.parametrize(
+    "input_shapes",
+    (
+        (torch.Size([1, 1, 32, 32])),
+        (torch.Size([1, 1, 320, 384])),
+        (torch.Size([1, 3, 320, 384])),
+    ),
+)
+@skip_for_grayskull("#ToDo: GS implementation needs to be done for remainder")
+def test_binary_lcm_ttnn(input_shapes, device):
+    in_data1, input_tensor1 = data_gen_with_range_int(input_shapes, -1024, 1024, device)
+    in_data2, input_tensor2 = data_gen_with_range_int(input_shapes, -1024, 1024, device)
+    output_tensor = ttnn.lcm(input_tensor1, input_tensor2)
+    golden_function = ttnn.get_golden_function(ttnn.lcm)
+    golden_tensor = golden_function(in_data1, in_data2)
+
+    comp_pass = compare_pcc([output_tensor], [golden_tensor])
     assert comp_pass
diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp
index 870a3c20f60..2667c848bf3 100644
--- a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp
+++ b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp
@@ -201,7 +201,17 @@ struct ExecuteBinaryRemainder
         const std::optional<MemoryConfig>& memory_config = std::nullopt);
 };

-} // namespace binary
+#define DEFINE_BINARY_COMPOSITE(op_name) \
+struct Execute##op_name { \
+    static Tensor invoke( \
+        const Tensor& input_tensor_a, \
+        const Tensor& input_tensor_b, \
+        const std::optional<MemoryConfig>& memory_config = std::nullopt); \
+};
+DEFINE_BINARY_COMPOSITE(LCM)
+DEFINE_BINARY_COMPOSITE(GCD)
+
+} // namespace binary
 } // namespace operations

 constexpr auto hypot = ttnn::register_operation_with_auto_launch_op<
@@ -264,5 +274,11 @@ constexpr auto outer = ttnn::register_operation_with_auto_launch_op<
 constexpr auto polyval = ttnn::register_operation_with_auto_launch_op<
     "ttnn::polyval",
     operations::binary::ExecuteBinaryCompositeOpsPolyval>();
+constexpr auto gcd = ttnn::register_operation_with_auto_launch_op<
+    "ttnn::gcd",
+    operations::binary::ExecuteGCD>();
+constexpr auto lcm = ttnn::register_operation_with_auto_launch_op<
+    "ttnn::lcm",
+    operations::binary::ExecuteLCM>();

 } // namespace ttnn
diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp
index 35405fb9363..d97d5c7a31f 100644
--- a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp
+++ b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp
@@ -879,6 +879,22 @@ void py_module(py::module& module) {
         ttnn::logical_and_,
         R"doc(Compute inplace logical AND of :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc");

+    detail::bind_binary_composite(
+        module,
+        ttnn::gcd,
+        R"doc(Compute greatest common divisor of :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`.
+        [supported range -1024 to 1024].)doc",
+        R"doc(\mathrm{output\_tensor}_i = \text{gcd}\left(\mathrm{input\_tensor\_a}_i , \mathrm{input\_tensor\_b}_i\right)
+        )doc");
+
+    detail::bind_binary_composite(
+        module,
+        ttnn::lcm,
+        R"doc(Compute least common multiple of :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`.
+        [supported range -1024 to 1024].)doc",
+        R"doc(\mathrm{output\_tensor}_i = \text{lcm}\left(\mathrm{input\_tensor\_a}_i , \mathrm{input\_tensor\_b}_i\right)
+        )doc");
+
     detail::bind_binary_composite_with_alpha(
         module,
         ttnn::addalpha,
diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp
index 61b32f80db3..e1c88963491 100644
--- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp
+++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp
@@ -272,7 +272,6 @@ Tensor ExecuteBinaryRemainder::invoke(const Tensor& input_a, const Tensor& input
     return typecast(result, input_dtype);
 }
-
 Tensor ExecuteBinaryRemainder::invoke(const Tensor& input, float scalar, const std::optional<MemoryConfig>& output_mem_config) {
     return ttnn::unary_remainder(input, scalar);
 }

@@ -400,4 +399,33 @@ Tensor _polyval(const Tensor& input_a, const std::vector<float>& coeffs, const s
     return final_tensor;
 }

+Tensor ExecuteGCD::invoke(const Tensor& input_a, const Tensor& input_b, const std::optional<MemoryConfig>& output_mem_config) {
+    Tensor input_a_abs = ttnn::abs(input_a);
+    Tensor input_b_abs = ttnn::abs(input_b);
+    Tensor a_gt_b = ttnn::gt(input_a_abs, input_b_abs);
+    Tensor min = ttnn::where(a_gt_b, input_b_abs, input_a_abs);
+    Tensor max = ttnn::where(a_gt_b, input_a_abs, input_b_abs);
+    a_gt_b.deallocate();
+    // https://en.wikipedia.org/wiki/Lam%C3%A9%27s_theorem
+    // While 186 is the theoretical maximum number of iterations for inputs within the floating-point range
+    // according to Lame's theorem, in practice, when evaluating the gcd of consecutive Fibonacci numbers
+    // coerced to floating point, at most 14 iterations are needed because the remainder converges to 0 much
+    // more quickly. In addition, the limited precision of bfloat16 restricts supported inputs to [-1024, 1024].
+    constexpr std::size_t max_iterations = 14;
+    for (std::size_t iteration = 0; iteration < max_iterations; ++iteration) {
+        Tensor isz = ttnn::eqz(min);
+        Tensor rem = ttnn::remainder(max, ttnn::where(isz, isz, min));
+        max = ttnn::where(isz, max, min);
+        min = rem;
+    }
+    return max;
+}
+
+Tensor ExecuteLCM::invoke(const Tensor& input_a, const Tensor& input_b, const std::optional<MemoryConfig>& output_mem_config) {
+    Tensor val = ttnn::multiply(input_a, input_b, std::nullopt, output_mem_config);
+    Tensor tmp_result = ttnn::gcd(input_a, input_b);
+    Tensor result = ttnn::div(val, tmp_result, false, "None", output_mem_config);
+    return ttnn::abs(result);
+}
+
 } // namespace ttnn::operations::binary
diff --git a/ttnn/ttnn/operations/binary.py b/ttnn/ttnn/operations/binary.py
index 77c7c617abc..ce8b02488f0 100644
--- a/ttnn/ttnn/operations/binary.py
+++ b/ttnn/ttnn/operations/binary.py
@@ -437,4 +437,22 @@ def _golden_function_ne_(input_tensor_a, input_tensor_b, *args, **kwargs):
 ttnn.attach_golden_function(ttnn.ne_, golden_function=_golden_function_ne_)


+def _golden_function_gcd(input_tensor_a, input_tensor_b, *args, **kwargs):
+    import torch
+
+    return torch.gcd(input_tensor_a, input_tensor_b)
+
+
+ttnn.attach_golden_function(ttnn.gcd, golden_function=_golden_function_gcd)
+
+
+def _golden_function_lcm(input_tensor_a, input_tensor_b, *args, **kwargs):
+    import torch
+
+    return torch.lcm(input_tensor_a, input_tensor_b)
+
+
+ttnn.attach_golden_function(ttnn.lcm, golden_function=_golden_function_lcm)
+
+
 __all__ = []
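
The fixed iteration count in ExecuteGCD::invoke above can be sanity-checked on the host with a short torch sketch that replays the same element-wise Euclidean loop. This is an illustration only, not the ttnn device path (which computes on bfloat16 tiles); the helper name gcd_fixed_iterations is made up for this example.

import torch

def gcd_fixed_iterations(a, b, iterations=14):
    # Same scheme as ExecuteGCD::invoke: sort |a|, |b| into (min, max), then
    # repeat rem = max % min a fixed number of times with a zero-guard,
    # instead of using a data-dependent loop exit.
    mn = torch.minimum(a.abs(), b.abs()).to(torch.float32)
    mx = torch.maximum(a.abs(), b.abs()).to(torch.float32)
    for _ in range(iterations):
        isz = mn == 0
        # Substitute 1 where min == 0 to avoid dividing by zero, mirroring
        # ttnn::where(isz, isz, min) in the C++ above.
        rem = torch.remainder(mx, torch.where(isz, torch.ones_like(mn), mn))
        mx = torch.where(isz, mx, mn)
        mn = rem
    return mx.to(torch.int64)

# Worst case in [-1024, 1024] is a consecutive Fibonacci pair such as
# (610, 987), which needs all 14 remainder steps to reach gcd = 1.
a = torch.randint(-1024, 1024, (1, 1, 32, 32))
b = torch.randint(-1024, 1024, (1, 1, 32, 32))
assert torch.equal(gcd_fixed_iterations(a, b), torch.gcd(a, b))

A fixed trip count fits a tile-based eltwise op well: every element executes the same sequence of remainder/where operations, so no data-dependent control flow is needed on device, and iterations past convergence are harmless because (min, max) is stable once min reaches 0.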