From 354c04767231553693e7d78985e34cb321bf1e18 Mon Sep 17 00:00:00 2001
From: Bharane AB
Date: Sat, 27 Jul 2024 14:42:33 +0000
Subject: [PATCH] #10778: Update argmax op with ttnn support

---
 docs/source/ttnn/ttnn/api.rst                 |   1 +
 docs/source/ttnn/ttnn/dependencies/tt_lib.rst |   2 -
 docs/source/ttnn/ttnn/ttnn/logical_not_.rst   |   6 +
 .../grayskull/test_argmax_padding.py          |   2 +-
 tests/ttnn/profiling/ops_for_profiling.py     |  18 +--
 .../op_library/composite/composite_ops.cpp    | 127 ------------------
 .../op_library/composite/composite_ops.hpp    |   6 -
 .../tt_lib_bindings_tensor_composite_ops.cpp  |  24 ----
 8 files changed, 17 insertions(+), 169 deletions(-)
 create mode 100644 docs/source/ttnn/ttnn/ttnn/logical_not_.rst

diff --git a/docs/source/ttnn/ttnn/api.rst b/docs/source/ttnn/ttnn/api.rst
index a2a0b38c9dd5..282439ba2d98 100644
--- a/docs/source/ttnn/ttnn/api.rst
+++ b/docs/source/ttnn/ttnn/api.rst
@@ -75,6 +75,7 @@ Pointwise Unary

    ttnn/abs
    ttnn/acos
+   ttnn/logical_not_
    ttnn/acosh
    ttnn/asin
    ttnn/asinh
diff --git a/docs/source/ttnn/ttnn/dependencies/tt_lib.rst b/docs/source/ttnn/ttnn/dependencies/tt_lib.rst
index d13bba05a175..82d8e92c54d7 100644
--- a/docs/source/ttnn/ttnn/dependencies/tt_lib.rst
+++ b/docs/source/ttnn/ttnn/dependencies/tt_lib.rst
@@ -529,8 +529,6 @@ Other Operations

 .. autofunction:: tt_lib.tensor.repeat

-.. autofunction:: tt_lib.tensor.argmax
-
 Loss Functions
 ==============

diff --git a/docs/source/ttnn/ttnn/ttnn/logical_not_.rst b/docs/source/ttnn/ttnn/ttnn/logical_not_.rst
new file mode 100644
index 000000000000..b9078c4f7294
--- /dev/null
+++ b/docs/source/ttnn/ttnn/ttnn/logical_not_.rst
@@ -0,0 +1,6 @@
+.. _ttnn.logical_not_:
+
+ttnn.logical_not_
+###################
+
+.. autofunction:: ttnn.logical_not_
diff --git a/tests/tt_eager/python_api_testing/non_working_unit_tests/grayskull/test_argmax_padding.py b/tests/tt_eager/python_api_testing/non_working_unit_tests/grayskull/test_argmax_padding.py
index fc52f8357071..1059257803e9 100644
--- a/tests/tt_eager/python_api_testing/non_working_unit_tests/grayskull/test_argmax_padding.py
+++ b/tests/tt_eager/python_api_testing/non_working_unit_tests/grayskull/test_argmax_padding.py
@@ -31,7 +31,7 @@ def test_argmax(self, input_shapes, dim, all, device):
         .to(tt_lib.tensor.Layout.TILE)
         .to(device)
     )
-    tt_output_tensor_on_device = tt_lib.tensor.argmax(input_tensor, dim=dim, all=all)
+    tt_output_tensor_on_device = ttnn.experimental.argmax(input_tensor, dim=dim, all=all)
     tt_out_tensor = tt_output_tensor_on_device.cpu().to(tt_lib.tensor.Layout.ROW_MAJOR).to_torch()
     if all:
         golden_tensor = torch.argmax(input_data)
diff --git a/tests/ttnn/profiling/ops_for_profiling.py b/tests/ttnn/profiling/ops_for_profiling.py
index 8101448cf346..aa9d454dfd2b 100644
--- a/tests/ttnn/profiling/ops_for_profiling.py
+++ b/tests/ttnn/profiling/ops_for_profiling.py
@@ -1516,23 +1516,23 @@ def pow_float(x):


 def argmax_1(x):
-    tt_lib.tensor.argmax(x, dim=-1)
+    ttnn.argmax(x, dim=-1)


 def argmax_2(x):
-    tt_lib.tensor.argmax(x, dim=-2)
+    ttnn.argmax(x, dim=-2)


 def argmax_3(x):
-    tt_lib.tensor.argmax(x, dim=-3)
+    ttnn.argmax(x, dim=-3)


 def argmax_4(x):
-    tt_lib.tensor.argmax(x, dim=-4)
+    ttnn.argmax(x, dim=-4)


 def argmax_all(x):
-    tt_lib.tensor.argmax(x, dim=-1, all=True)
+    ttnn.argmax(x, dim=-1, all=True)


 def argmin_1(x):
@@ -2264,22 +2264,22 @@ def clone(x):
     },
     {
         "op": argmax_1,
-        "name": "tt_lib.tensor.argmax_dim_3",
+        "name": "ttnn.argmax_dim_3",
         "num_repeats": 2,
     },
     {
         "op": argmax_2,
-        "name": "tt_lib.tensor.argmax_dim_2",
+        "name": "ttnn.argmax_dim_2",
         "num_repeats": 2,
     },
     {
         "op": argmax_3,
-        "name": "tt_lib.tensor.argmax_dim_1",
+        "name": "ttnn.argmax_dim_1",
         "num_repeats": 2,
     },
     {
         "op": argmax_all,
-        "name": "tt_lib.tensor.argmax_all",
+        "name": "ttnn.argmax_all",
         "num_repeats": 2,
     },
     {
diff --git a/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/composite/composite_ops.cpp b/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/composite/composite_ops.cpp
index 6fd2570a8911..5c2b32b0c83b 100644
--- a/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/composite/composite_ops.cpp
+++ b/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/composite/composite_ops.cpp
@@ -733,14 +733,6 @@ Tensor sfpu_eps(const Shape shape, Layout layout, Device* device, const MemoryCo
     return operation::decorate_as_composite(__func__, _sfpu_eps)(shape, layout, device, output_mem_config);
 }

-Tensor triu(
-    const Tensor& input_a,
-    int32_t dim /* = -1 */,
-    const MemoryConfig& output_mem_config /* = operation::DEFAULT_OUTPUT_MEMORY_CONFIG */) {
-    return operation::decorate_as_composite(__func__, _triu)(input_a, dim, output_mem_config);
-}
-
-
 Tensor create_mask(const Tensor& input_a, const MemoryConfig& output_mem_config) {
     auto& padded_shape = input_a.get_legacy_shape();
     auto& unpadded_shape = padded_shape.without_padding();
@@ -751,125 +743,6 @@ Tensor create_mask(const Tensor& input_a, const MemoryConfig& output_mem_config)
     masked_input = ttnn::where(masked_input, input_a, t_inf, output_mem_config);
     return masked_input;
 }
-// Argmax returns the index of maximum element in the tensor
-Tensor _argmax(const Tensor& input_t, int64_t _dim, bool all, const MemoryConfig& output_mem_config) {
-    std::vector<Tensor> output_tensors = {Tensor(operation::get_workers_for_op_output({input_t}))};
-    operation::launch_with_autoformat(
-        [_dim, all, output_mem_config](
-            const std::vector<Tensor>& input_tensors,
-            const std::vector<std::optional<const Tensor>>& optional_input_tensors,
-            const std::vector<std::optional<Tensor>>& optional_output_tensors) mutable -> std::vector<Tensor> {
-            const auto& input = input_tensors.at(0);
-            auto& input_shape = input.get_legacy_shape();
-            TT_FATAL(input_shape.rank() == 4, "supported for rank-4 tensors at this time");
-
-            Tensor input_a = create_mask(input, output_mem_config);
-
-            uint32_t dim = input_shape.get_normalized_index(_dim);
-            int size = input_a.volume();
-
-            if (!all) {
-                if ((dim == (input_shape.rank() - 1)) || (dim == (input_shape.rank() - 2))) {
-                    bool is_width = (dim == (input_shape.rank() - 1));
-                    Tensor max_val = max(input_a, dim, output_mem_config);
-                    Tensor max_tensor = zeros_like(input_a, output_mem_config);
-                    Tensor tindex = tt::numpy::index_width(
-                        input_shape, DataType::BFLOAT16, Layout::TILE, input_a.device(), output_mem_config);
-                    if (is_width) {
-                        max_tensor = ttnn::add(max_tensor, max_val, std::nullopt, output_mem_config);
-                    } else {
-                        tindex = tt::numpy::index_height(
-                            input_shape, DataType::BFLOAT16, Layout::TILE, input_a.device(), output_mem_config);
-                        max_tensor = ttnn::add(max_tensor, max_val, std::nullopt, output_mem_config);
-                    }
-                    tindex = tindex.to(input_a.device());
-                    max_val.deallocate();
-                    Tensor cmp_results = ttnn::eq(input_a, max_tensor, std::nullopt, output_mem_config);
-                    max_tensor.deallocate();
-                    Tensor max_indices = ttnn::multiply(cmp_results, tindex, std::nullopt, output_mem_config);
-                    cmp_results.deallocate();
-                    Tensor result = ttnn::where(ttnn::eqz(max_indices), size, max_indices, output_mem_config);
-                    max_indices.deallocate();
-                    result = min(result, dim, output_mem_config);
-                    Tensor res_index = zeros_like(result, output_mem_config);
-                    result = ttnn::where(ttnn::eq(result, size), res_index, result, output_mem_config);
-                    std::vector<int64_t> permute_dims = {3, 0, 1, 2};
-                    if (is_width) {
-                        res_index = ttnn::add(res_index, result, std::nullopt, output_mem_config);
-                    } else {
-                        res_index = ttnn::add(res_index, result, std::nullopt, output_mem_config);
-                        permute_dims[0] = 2;
-                        permute_dims[3] = 3;
-                    }
-                    result.deallocate();
-                    Tensor transpose_res = ttnn::permute(res_index, permute_dims, output_mem_config);
-                    return {transpose_res};
-                } else if ((dim == (input_shape.rank() - 3)) || (dim == (input_shape.rank() - 4))) {
-                    bool is_channel = (dim == (input_shape.rank() - 3));
-                    Tensor max_val = max(input_a, dim, output_mem_config);
-                    int repeat = input.get_shape()[dim];
-                    std::vector<Tensor> combined_tensors;
-                    for (int cid = 0; cid < repeat; cid++) combined_tensors.emplace_back(max_val);
-                    max_val.deallocate();
-                    Tensor concat_out = concat(combined_tensors, dim, output_mem_config);
-                    // Needed till `max` stops autoformatting output
-                    concat_out = ttnn::reshape(concat_out, input_a.get_shape());
-                    Tensor cmp_results = ttnn::eq(input_a, concat_out, std::nullopt, output_mem_config);
-                    concat_out.deallocate();
-                    Tensor tindex = tt::numpy::index_channel(
-                        input_shape, DataType::BFLOAT16, Layout::TILE, input_a.device(), output_mem_config);
-                    if (!is_channel) {
-                        tindex = tt::numpy::index_batch(
-                            input_shape, DataType::BFLOAT16, Layout::TILE, input_a.device(), output_mem_config);
-                    }
-                    tindex = tindex.to(input_a.device());
-                    Tensor max_indices = ttnn::multiply(cmp_results, tindex, std::nullopt, output_mem_config);
-                    cmp_results.deallocate();
-                    Tensor midx = full_like(max_indices, size);
-                    Tensor result = ttnn::where(ttnn::eqz(max_indices), midx, max_indices, output_mem_config);
-                    max_indices.deallocate();
-                    result = min(result, dim, output_mem_config);
-                    Tensor res_index = zeros_like(result, output_mem_config);
-                    result = ttnn::where(ttnn::eq(result, full_like(result, size)), res_index, result, output_mem_config);
-                    res_index.deallocate();
-                    if (is_channel) {
-                        std::vector<int64_t> permute_dims = {1, 0, 2, 3};
-                        Tensor transpose_res = ttnn::permute(result, permute_dims, output_mem_config);
-                        return {transpose_res};
-                    } else {
-                        return {result};
-                    }
-                }
-            }
-            // TODO: Fix the index generation code. With the fix the code will work for argmax that return entire
-            // maximum value index
-            Tensor tindex = tt::numpy::index_all(
-                input_shape, DataType::BFLOAT16, Layout::TILE, input_a.device(), output_mem_config);
-            Tensor max_val = global_max(input_a, output_mem_config);
-            Tensor max_tensor = zeros_like(input_a, output_mem_config);
-            max_tensor = ttnn::add(max_tensor, max_val, std::nullopt, output_mem_config);
-            max_val.deallocate();
-            Tensor cmp_results = ttnn::eq(input_a, max_tensor, std::nullopt, output_mem_config);
-            max_tensor.deallocate();
-            Tensor max_indices = ttnn::multiply(cmp_results, tindex, std::nullopt, output_mem_config);
-            cmp_results.deallocate();
-            Tensor result = ttnn::where(ttnn::eqz(max_indices), size, max_indices, output_mem_config);
-            max_indices.deallocate();
-            result = global_min(result, output_mem_config);
-            return {result};
-        },
-        {input_t},
-        output_tensors);
-    return output_tensors.at(0);
-}
-
-Tensor argmax(
-    const Tensor& input_a,
-    int64_t dim,
-    bool all,
-    const MemoryConfig& output_mem_config /* = operation::DEFAULT_OUTPUT_MEMORY_CONFIG */) {
-    return operation::decorate_as_composite(__func__, _argmax)(input_a, dim, all, output_mem_config);
-}

 }  // namespace tt_metal

diff --git a/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/composite/composite_ops.hpp b/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/composite/composite_ops.hpp
index 42bf420446af..c0a342e4fceb 100644
--- a/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/composite/composite_ops.hpp
+++ b/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/composite/composite_ops.hpp
@@ -276,12 +276,6 @@ Tensor logical_ori(
 // on-device tensor creation with shape and filled with value
 Tensor sfpu_eps(const Shape shape, Layout layout, Device* device, const MemoryConfig& output_mem_config);

-Tensor argmax(
-    const Tensor& input_a,
-    int64_t dim = 0,
-    bool all = false,
-    const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG);
-
 }  // namespace tt_metal

 }  // namespace tt
diff --git a/ttnn/cpp/ttnn/deprecated/tt_lib/csrc/tt_lib_bindings_tensor_composite_ops.cpp b/ttnn/cpp/ttnn/deprecated/tt_lib/csrc/tt_lib_bindings_tensor_composite_ops.cpp
index db5a39c16846..ed1740d4b515 100644
--- a/ttnn/cpp/ttnn/deprecated/tt_lib/csrc/tt_lib_bindings_tensor_composite_ops.cpp
+++ b/ttnn/cpp/ttnn/deprecated/tt_lib/csrc/tt_lib_bindings_tensor_composite_ops.cpp
@@ -118,30 +118,6 @@ void TensorModuleCompositeOPs(py::module& m_tensor) {
         R"doc(Perform an eltwise logical OR (``{0} || {1}``) on input tensor and immediate value.)doc",
         R"doc("Scalar", "float", "")doc");

-    m_tensor.def(
-        "argmax",
-        &argmax,
-        py::arg("input").noconvert(),
-        py::arg("dim"),
-        py::arg("all") = false,
-        py::arg("output_mem_config").noconvert() = operation::DEFAULT_OUTPUT_MEMORY_CONFIG,
-        R"doc(
-        Returns the indices of the maximum value of elements in the ``input`` tensor
-        If ``all`` is set to ``true`` irrespective of given dimension it will return the indices of maximum value of all elements in given ``input``
-
-        Input tensor must have BFLOAT16 data type.
-
-        Output tensor will have BFLOAT16 data type.
-
-        .. csv-table::
-            :header: "Argument", "Description", "Data type", "Valid range", "Required"
-
-            "input", "Tensor argmax is applied to", "Tensor", "Tensor of shape [W, Z, Y, X]", "Yes"
-            "dim", "Dimension to perform argmax", "int", "", "Yes"
-            "all", "Consider all dimension (ignores ``dim`` param)", "bool", "default to false", "No"
-            "output_mem_config", "Layout of tensor in TT Accelerator device memory banks", "MemoryConfig", "Default is interleaved in DRAM", "No"
-        )doc");
-
     m_tensor.def(
         "lerp",
         py::overload_cast(&lerp),