Update documentation for adding new ttnn operation #9841
@@ -7,174 +7,98 @@ Adding New ttnn Operation

Not all operations may be functional on all Tenstorrent hardware (Grayskull,
Wormhole, or others).

Removed:

C++ Implementation
------------------

Add `tt_eager/tt_dnn/op_library/<new_operation>/<new_operation>.hpp`:

.. code-block:: cpp

    #pragma once

    #include <optional>

    #include "tensor/tensor.hpp"
    #include "tt_dnn/op_library/operation.hpp"

    namespace tt {
    namespace tt_metal {

    struct <NewOperation> {
        bool some_arg;

        // These methods are needed if the operation takes in input tensors and produces output tensors
        void validate(const std::vector<Tensor> &input_tensors) const;
        std::vector<Shape> compute_output_shapes(const std::vector<Tensor> &input_tensors) const;
        std::vector<Tensor> create_output_tensors(const std::vector<Tensor> &input_tensors) const;
        operation::ProgramWithCallbacks create_program(const std::vector<Tensor> &input_tensors, std::vector<Tensor> &output_tensors) const;

        // This is needed until we get C++20
        static constexpr auto attribute_names = std::forward_as_tuple("some_arg");
        const auto attribute_values() const {
            return std::forward_as_tuple(this->some_arg);
        }
    };

    Tensor <new_operation>(const Tensor &input_tensor, bool some_arg);

    }  // namespace tt_metal
    }  // namespace tt
Added:

What is a ttnn operation?
-------------------------

A ttnn operation is a function that takes in one or more input tensors and produces one or more output tensors. It is implemented in C++ and can be called from Python.

.. note::
    If you need optional input tensors or would like to pass in optional output tensors, refer to :doc:`Operations </ttnn/dependencies/tt_lib>` for how to write ops that use them.

What steps are needed to add a ttnn operation in C++?
------------------------------------------------------
1. (Optional) Implement the device operation in C++. A device operation is a struct that specifies how to create output tensors and a program to run on the device. If the ttnn operation is composed of other ttnn operations, you can skip this step.
2. Implement the ttnn operation in C++ and register it using `ttnn::register_operation`.

What steps are needed to add a ttnn operation in Python?
---------------------------------------------------------
1. Take an existing registered C++ operation and add a Python binding for it using `ttnn::bind_registered_operation`.
2. In Python, decorate the operation using `ttnn.register_operation`. (This step will be deprecated in the future.)
Removed:

Add `tt_eager/tt_dnn/op_library/<new_operation>/<new_operation>.cpp`:

.. code-block:: cpp

    #include "tt_metal/host_api.hpp"
    #include "tt_dnn/op_library/run_operation.hpp"

    namespace tt {
    namespace tt_metal {

    void <NewOperation>::validate(const std::vector<Tensor> &input_tensors) const {
        ...
    }

    std::vector<Shape> <NewOperation>::compute_output_shapes(const std::vector<Tensor> &input_tensors) const {
        std::vector<Shape> output_shapes = ...;
        return output_shapes;
    }

    std::vector<Tensor> <NewOperation>::create_output_tensors(const std::vector<Tensor> &input_tensors) const {
        std::vector<Tensor> output_tensors = ...;
        return output_tensors;
    }

    operation::ProgramWithCallbacks <NewOperation>::create_program(const std::vector<Tensor> &input_tensors, std::vector<Tensor> &output_tensors) const {
        Program program = ...;
        return operation::ProgramWithCallbacks{program};
    }

    Tensor <new_operation>(const Tensor &input_tensor, bool some_arg) {
        std::vector<Tensor> output_tensors = operation::run(<NewOperation>{some_arg}, {input_tensor});
        return output_tensors[0];
    }

    }  // namespace tt_metal
    }  // namespace tt

Added:

C++ Implementation
------------------

Step 1: Implement device operation (Optional)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In order to add a new device operation, follow the directory structure shown below:

`ttnn/cpp/ttnn/operations/<category>/<operation_name>/device/<operation_name>_device_operation.hpp`
`ttnn/cpp/ttnn/operations/<category>/<operation_name>/device/<operation_name>_device_operation.cpp`
`ttnn/cpp/ttnn/operations/<category>/<operation_name>/device/<program_factory_0>_program_factory.cpp`

Reviewer comment: I am a little concerned about the `<category>` part. People often mess up categorization, especially when it is not 100% clear. I am OK to keep it like that; just want to raise a question for discussion.

Reviewer comment: `_device_operation` is what we called
Removed:

Add pybindings
--------------

In `tt_eager/tt_lib/csrc/tt_lib_bindings_tensor.cpp`, add the following lines:

.. code-block:: cpp

    m_tensor.def("<new_operation>", &<new_operation>, py::arg("input_tensor").noconvert(), py::arg("some_arg").noconvert(), R"doc(
        <NewOperation> runs new operation on input tensor.

        .. csv-table::
            :header: "Argument", "Description", "Data type", "Valid range", "Required"

            "input_tensor", "Input tensor", "Tensor", "Tensor of shape [W0, Z0, Y0, X0]", "Yes"
            "some_arg", "Some arg", "bool", "Some arg to do some stuff in new operation", "Yes"
    )doc");

Added:

.. note::
    Add as many program factories as needed.

A concrete example of a device operation can be found in `ttnn/cpp/ttnn/operations/examples/example/device`.

`ttnn/cpp/ttnn/operations/examples/example/device/example_device_operation.hpp`:

.. literalinclude:: examples/example/device/example_device_operation.hpp

`ttnn/cpp/ttnn/operations/examples/example/device/example_device_operation.cpp`:

.. literalinclude:: examples/example/device/example_device_operation.cpp

`ttnn/cpp/ttnn/operations/examples/example/device/single_core_program_factory.cpp`:

.. literalinclude:: examples/example/device/single_core_program_factory.cpp
Removed:

Adding a unit test
------------------

Add `tests/ttnn/unit_tests/ttl/test_<new_operation>.py`:

.. code-block:: python

    import pytest
    import torch
    import ttnn

    from tests.ttnn.utils_for_testing import assert_with_pcc

    @pytest.mark.parametrize("height", [32])
    @pytest.mark.parametrize("width", [32])
    def test_<new_operation>(device, height, width):
        torch.manual_seed(0)

        torch_input_tensor = torch.rand(1, 1, height, width)
        torch_output_tensor = torch.exp(torch_input_tensor)

        input_tensor = ttnn.from_torch(torch_input_tensor, device=device)
        output_tensor = ttnn.experimental.tensor.<new_operation>(input_tensor)
        output_tensor = ttnn.to_torch(output_tensor)

        assert_with_pcc(torch_output_tensor, output_tensor)

Added:

`ttnn/cpp/ttnn/operations/examples/example/device/multi_core_program_factory.cpp`:

.. literalinclude:: examples/example/device/multi_core_program_factory.cpp

Step 2: Implement the operation in C++
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In order to add a new operation, add the following file:

`ttnn/cpp/ttnn/operations/<category>/<operation_name>/<operation_name>.hpp`

Reviewer comment: Let's add a brief description of this file.

A concrete example:

`ttnn/cpp/ttnn/operations/examples/example/example.hpp`:

.. literalinclude:: examples/example/example.hpp
Removed:

Adding a sweep test
-------------------

Add `tests/ttnn/sweep_tests/sweeps/ttl_<new_operation>.py`:

.. code-block:: python

    from typing import Optional, Tuple

    import torch
    import ttnn

    from tests.ttnn.utils_for_testing import check_with_pcc

    parameters = {
        "height": [384, 1024],
        "width": [1024, 4096],
    }

    def run(
        height,
        width,
        *,
        device,
    ) -> Tuple[bool, Optional[str]]:
        torch_input_tensor = torch.rand(1, 1, height, width)
        torch_output_tensor = torch.exp(torch_input_tensor)

        input_tensor = ttnn.from_torch(torch_input_tensor, device=device)
        output_tensor = ttnn.experimental.tensor.<new_operation>(input_tensor)
        output_tensor = ttnn.to_torch(output_tensor)

        return check_with_pcc(torch_output_tensor, output_tensor)

Added:

Python Implementation
---------------------

Step 1: Add Python binding
~~~~~~~~~~~~~~~~~~~~~~~~~~

In order to add a Python binding for the operation, follow the directory structure shown below:

`ttnn/python/ttnn/operations/<category>/<operation_name>/<operation_name>_pybind.hpp`
`ttnn/python/ttnn/operations/<category>/<category>_pybind.hpp`

A concrete example:

Reviewer comment: Maybe let's add some notes about consistency? Not sure if here or in general. — Will do in the next PR.

`ttnn/python/ttnn/operations/examples/example/example_pybind.hpp`:

.. literalinclude:: examples/example/example_pybind.hpp

`ttnn/python/ttnn/operations/examples/examples_pybind.hpp`:

.. literalinclude:: examples/example/example_pybind.hpp

Finally, call the module defined in `examples/example/example_pybind.hpp` wherever you want it to be added.

Step 2: Register the operation in Python
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

TODO: Add the description of how to register the operation in Python.
New file (a symlink into the examples directory):

@@ -0,0 +1 @@
../../../../ttnn/cpp/ttnn/operations/examples/
@@ -385,7 +385,13 @@ inline std::string op_meta_data_serialized_json(

    j["optional_input_tensors"] = std::vector<json>{};

Removed:

    auto perfModel = operation_t::create_op_performance_model(operation_attributes, tensor_args, tensor_return_value);

Added:

    auto perfModel = [&]() {
        if constexpr (requires { operation_t::create_op_performance_model; }) {
            return operation_t::create_op_performance_model(operation_attributes, tensor_args, tensor_return_value);
        } else {
            return operation::OpPerformanceModel{};
        }
    }();
    j["performance_model"]["compute_ns"] = perfModel.get_compute_ns();
    j["performance_model"]["ideal_ns"] = perfModel.get_ideal_ns();
    j["performance_model"]["bandwidth_ns"] = perfModel.get_bandwidth_ns();

Reviewer comment: Do we want to write some more info on "how to perf model"?

Reviewer comment: function but not really :D