diff --git a/ttnn/cpp/pybind11/__init__.cpp b/ttnn/cpp/pybind11/__init__.cpp index ac209e3caeb..4f263b3760f 100644 --- a/ttnn/cpp/pybind11/__init__.cpp +++ b/ttnn/cpp/pybind11/__init__.cpp @@ -48,7 +48,7 @@ PYBIND11_MODULE(_ttnn, module) { auto m_operations = module.def_submodule("operations", "ttnn Operations"); // TYPES - ttnn::tensor::py_module_types(m_tensor); + ttnn::tensor::tensor_mem_config_module_types(m_tensor); ttnn::tensor::pytensor_module_types(m_tensor); ttnn::graph::py_graph_module_types(m_graph); @@ -63,7 +63,7 @@ PYBIND11_MODULE(_ttnn, module) { ttnn::reports::py_module_types(m_reports); // FUNCTIONS / OPERATIONS - ttnn::tensor::py_module(m_tensor); + ttnn::tensor::tensor_mem_config_module(m_tensor); ttnn::tensor::pytensor_module(m_tensor); ttnn::core::py_module(m_core); ttnn::graph::py_graph_module(m_graph); diff --git a/ttnn/cpp/pybind11/device.hpp b/ttnn/cpp/pybind11/device.hpp index 165224d0d67..9a57c4d1fae 100644 --- a/ttnn/cpp/pybind11/device.hpp +++ b/ttnn/cpp/pybind11/device.hpp @@ -12,7 +12,7 @@ namespace py = pybind11; namespace ttnn::device { - void py_device_module_types(py::module& module); - void py_device_module(py::module& module); +void py_device_module_types(py::module& module); +void py_device_module(py::module& module); } // namespace ttnn::device diff --git a/ttnn/cpp/pybind11/events.hpp b/ttnn/cpp/pybind11/events.hpp index ff4bb17b28e..6d308a88512 100644 --- a/ttnn/cpp/pybind11/events.hpp +++ b/ttnn/cpp/pybind11/events.hpp @@ -11,7 +11,7 @@ namespace py = pybind11; namespace ttnn::events { - void py_module_types(py::module& module); - void py_module(py::module& module); +void py_module_types(py::module& module); +void py_module(py::module& module); } // namespace ttnn::events diff --git a/ttnn/cpp/pybind11/profiler.hpp b/ttnn/cpp/pybind11/profiler.hpp index d2ab92753af..d6c9cf3414f 100644 --- a/ttnn/cpp/pybind11/profiler.hpp +++ b/ttnn/cpp/pybind11/profiler.hpp @@ -10,6 +10,6 @@ namespace py = pybind11; namespace ttnn::profiler { - void py_module(py::module& module); +void py_module(py::module& module); } // namespace ttnn::profiler diff --git a/ttnn/cpp/pybind11/pytensor.cpp b/ttnn/cpp/pybind11/pytensor.cpp index 2e316f30250..feddb572634 100644 --- a/ttnn/cpp/pybind11/pytensor.cpp +++ b/ttnn/cpp/pybind11/pytensor.cpp @@ -645,961 +645,961 @@ Tensor convert_python_tensors_to_tt_tensors(py::list tensor_shards, std::optiona } // namespace detail - void pytensor_module_types(py::module &m_tensor) { - using tt::tt_metal::Shape; - // Tensor constructors that accept device and .to(device) function use keep alive call policy to communicate that Device needs to outlive Tensor. - // This is because when tensors on device are destroyed they need to deallocate their buffers via device. - // keep_alive increases the ref count of the Device object being passed into the constructor and .to() function. - // For additional info see: https://pybind11.readthedocs.io/en/stable/advanced/functions.html#keep-alive - auto pyTensor = py::class_(m_tensor, "Tensor", R"doc( - - Class constructor supports tensors of rank 4. 
- The constructor takes following arguments: - - +------------+--------------------------------------------------------+---------------------------+------------------------------------+----------+ - | Argument | Description | Data type | Valid range | Required | - +============+========================================================+===========================+====================================+==========+ - | data | Data to store in TT tensor | List[float/int] | | Yes | - +------------+--------------------------------------------------------+---------------------------+------------------------------------+----------+ - | shape | Shape of TT tensor | List[int[4]] | | Yes | - +------------+--------------------------------------------------------+---------------------------+------------------------------------+----------+ - | data_type | Data type of numbers in TT tensor | ttnn.DataType | ttnn.DataType.BFLOAT16 | Yes | - | | | | | | - | | | | ttnn.DataType.FLOAT32 | | - | | | | | | - | | | | ttnn.DataType.UINT32 | | - | | | | | | - | | | | ttnn.DataType.BFLOAT8_B | | - | | | | | | - | | | | ttnn.DataType.BFLOAT4_B | | - +------------+--------------------------------------------------------+---------------------------+------------------------------------+----------+ - | layout | Layout of tensor data in memory | ttnn.Layout | ttnn.Layout.ROW_MAJOR | Yes | - | | | | | | - | | | | ttnn.Layout.TILE | | - +------------+--------------------------------------------------------+---------------------------+------------------------------------+----------+ - | device | Device on which tensor will be created | ttnn.Device | Host or TT accelerator device | No | - +------------+--------------------------------------------------------+---------------------------+------------------------------------+----------+ - | mem_config | Layout of tensor in TT Accelerator device memory banks | ttnn.MemoryConfig | | No | - +------------+--------------------------------------------------------+---------------------------+------------------------------------+----------+ - - )doc"); - } +void pytensor_module_types(py::module &m_tensor) { + using tt::tt_metal::Shape; + // Tensor constructors that accept device and .to(device) function use keep alive call policy to communicate that Device needs to outlive Tensor. + // This is because when tensors on device are destroyed they need to deallocate their buffers via device. + // keep_alive increases the ref count of the Device object being passed into the constructor and .to() function. + // For additional info see: https://pybind11.readthedocs.io/en/stable/advanced/functions.html#keep-alive + auto pyTensor = py::class_(m_tensor, "Tensor", R"doc( + + Class constructor supports tensors of rank 4. 
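+ Tensors constructed with a device argument keep that device alive for the lifetime of the tensor
+ (the bindings use pybind11's ``keep_alive`` call policy), because device-resident buffers must be
+ deallocated through their device.
+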
+ The constructor takes following arguments: + + +------------+--------------------------------------------------------+---------------------------+------------------------------------+----------+ + | Argument | Description | Data type | Valid range | Required | + +============+========================================================+===========================+====================================+==========+ + | data | Data to store in TT tensor | List[float/int] | | Yes | + +------------+--------------------------------------------------------+---------------------------+------------------------------------+----------+ + | shape | Shape of TT tensor | List[int[4]] | | Yes | + +------------+--------------------------------------------------------+---------------------------+------------------------------------+----------+ + | data_type | Data type of numbers in TT tensor | ttnn.DataType | ttnn.DataType.BFLOAT16 | Yes | + | | | | | | + | | | | ttnn.DataType.FLOAT32 | | + | | | | | | + | | | | ttnn.DataType.UINT32 | | + | | | | | | + | | | | ttnn.DataType.BFLOAT8_B | | + | | | | | | + | | | | ttnn.DataType.BFLOAT4_B | | + +------------+--------------------------------------------------------+---------------------------+------------------------------------+----------+ + | layout | Layout of tensor data in memory | ttnn.Layout | ttnn.Layout.ROW_MAJOR | Yes | + | | | | | | + | | | | ttnn.Layout.TILE | | + +------------+--------------------------------------------------------+---------------------------+------------------------------------+----------+ + | device | Device on which tensor will be created | ttnn.Device | Host or TT accelerator device | No | + +------------+--------------------------------------------------------+---------------------------+------------------------------------+----------+ + | mem_config | Layout of tensor in TT Accelerator device memory banks | ttnn.MemoryConfig | | No | + +------------+--------------------------------------------------------+---------------------------+------------------------------------+----------+ + + )doc"); +} - void pytensor_module(py::module &m_tensor) { - m_tensor.def( - "decorate_external_operation", - [](const py::function &function, std::optional function_name) -> py::function { - return py::cpp_function(std::function([function, function_name]( - const py::args &args, const py::kwargs &kwargs) { - ZoneScopedN("TT_DNN_FALLBACK_OP"); - uint32_t device_operation_id = ttnn::CoreIDs::instance().fetch_and_increment_device_operation_id(); - auto [operation, input_tensors] = detail::parse_external_operation(function, args, kwargs, function_name); - GraphTracker::instance().track_function_start(operation.get_type_name(), args, kwargs); - detail::log_external_operation(ttnn::CoreIDs::instance().get_python_operation_id(), device_operation_id, operation, input_tensors); - - auto output = function(*args, **kwargs); - - TracyOpTTNNExternal(device_operation_id, operation, input_tensors); - GraphTracker::instance().track_function_end(output); - return output; - })); - }, - py::arg("function").noconvert(), - py::arg("function_name").noconvert() = std::nullopt, +void pytensor_module(py::module &m_tensor) { + m_tensor.def( + "decorate_external_operation", + [](const py::function &function, std::optional function_name) -> py::function { + return py::cpp_function(std::function([function, function_name]( + const py::args &args, const py::kwargs &kwargs) { + ZoneScopedN("TT_DNN_FALLBACK_OP"); + uint32_t device_operation_id = 
ttnn::CoreIDs::instance().fetch_and_increment_device_operation_id(); + auto [operation, input_tensors] = detail::parse_external_operation(function, args, kwargs, function_name); + GraphTracker::instance().track_function_start(operation.get_type_name(), args, kwargs); + detail::log_external_operation(ttnn::CoreIDs::instance().get_python_operation_id(), device_operation_id, operation, input_tensors); + + auto output = function(*args, **kwargs); + + TracyOpTTNNExternal(device_operation_id, operation, input_tensors); + GraphTracker::instance().track_function_end(output); + return output; + })); + }, + py::arg("function").noconvert(), + py::arg("function_name").noconvert() = std::nullopt, + R"doc( + Decorate external operation for purposes of reporting and profiling. + + +----------+----------------------+-----------+-------------+----------+ + | Argument | Description | Data type | Valid range | Required | + +==========+======================+===========+=============+==========+ + | function | Fallback Operation | Function | | Yes | + +----------+----------------------+-----------+-------------+----------+ + | args | Packed args | tuple | | No | + +----------+----------------------+-----------+-------------+----------+ + | kwargs | Packed kwargs | dict | | No | + +----------+----------------------+-----------+-------------+----------+ + )doc"); + + auto pyTensor = static_cast>(m_tensor.attr("Tensor")); + pyTensor.def(py::init()) + .def( + py::init<>([](std::vector &&data, + const std::array &shape, + DataType data_type, + Layout layout) { + auto owned_buffer = detail::create_owned_buffer_from_vector_of_floats(std::move(data), data_type); + return Tensor(OwnedStorage{owned_buffer}, shape, data_type, layout); + }), + py::return_value_policy::move, R"doc( - Decorate external operation for purposes of reporting and profiling. - - +----------+----------------------+-----------+-------------+----------+ - | Argument | Description | Data type | Valid range | Required | - +==========+======================+===========+=============+==========+ - | function | Fallback Operation | Function | | Yes | - +----------+----------------------+-----------+-------------+----------+ - | args | Packed args | tuple | | No | - +----------+----------------------+-----------+-------------+----------+ - | kwargs | Packed kwargs | dict | | No | - +----------+----------------------+-----------+-------------+----------+ - )doc"); - - auto pyTensor = static_cast>(m_tensor.attr("Tensor")); - pyTensor.def(py::init()) - .def( - py::init<>([](std::vector &&data, - const std::array &shape, - DataType data_type, - Layout layout) { - auto owned_buffer = detail::create_owned_buffer_from_vector_of_floats(std::move(data), data_type); - return Tensor(OwnedStorage{owned_buffer}, shape, data_type, layout); - }), - py::return_value_policy::move, - R"doc( - +---------------+---------------+ - | Argument | Name | - +===============+===============+ - | arg0 | data | - +---------------+---------------+ - | arg1 | shape | - +---------------+---------------+ - | arg2 | data_type | - +---------------+---------------+ - | arg3 | layout | - +---------------+---------------+ - - Example of creating a TT Tensor on host: - - .. 
code-block:: python - - py_tensor = torch.randn((1, 1, 32, 32)) - ttnn.Tensor( - py_tensor.reshape(-1).tolist(), - py_tensor.size(), - ttnn.DataType.BFLOAT16, - ttnn.Layout.ROW_MAJOR, - ) - )doc") - .def( - py::init<>([](std::vector &&data, - const std::array &shape, - DataType data_type, - Layout layout, - Device *device) { - auto owned_buffer = detail::create_owned_buffer_from_vector_of_floats(std::move(data), data_type); - auto tensor = Tensor(OwnedStorage{owned_buffer}, shape, data_type, layout); - return tensor.to(device, MemoryConfig{}); - }), - py::keep_alive<1, 6>(), - py::return_value_policy::move, - R"doc( - +---------------+---------------+ - | Argument | Name | - +===============+===============+ - | arg0 | data | - +---------------+---------------+ - | arg1 | shape | - +---------------+---------------+ - | arg2 | data_type | - +---------------+---------------+ - | arg3 | layout | - +---------------+---------------+ - | arg3 | device | - +---------------+---------------+ - - Only BFLOAT16 (in ROW_MAJOR or TILE layout) and BFLOAT8_B, BFLOAT4_B (in TILE layout) are supported on device. - - Note that TT Tensor in ROW_MAJOR layout on TT Accelerator device must have size of last dimension divisble by 2. - - Example of creating a TT Tensor on TT accelerator device: - - .. code-block:: python - - py_tensor = torch.randn((1, 1, 32, 32)) - tt_device = ttnn.CreateDevice(0) - // ... - ttnn.Tensor( - py_tensor.reshape(-1).tolist(), - py_tensor.size(), - ttnn.DataType.BFLOAT16, - ttnn.Layout.ROW_MAJOR, - tt_device - ) - )doc") - .def( - py::init<>([](std::vector &&data, - const std::array &shape, - DataType data_type, - Layout layout, - Device *device, - const MemoryConfig &memory_config) { - auto owned_buffer = detail::create_owned_buffer_from_vector_of_floats(std::move(data), data_type); - auto tensor = Tensor(OwnedStorage{owned_buffer}, shape, data_type, layout); - return tensor.to(device, memory_config); - }), - py::keep_alive<1, 6>(), - py::return_value_policy::move, - R"doc( - +---------------+---------------+ - | Argument | Name | - +===============+===============+ - | arg0 | data | - +---------------+---------------+ - | arg1 | shape | - +---------------+---------------+ - | arg2 | data_type | - +---------------+---------------+ - | arg3 | layout | - +---------------+---------------+ - | arg4 | device | - +---------------+---------------+ - | arg5 | mem_config | - +---------------+---------------+ - - Only BFLOAT16 (in ROW_MAJOR or TILE layout) and BFLOAT8_B, BFLOAT4_B (in TILE layout) are supported on device. - - Note that TT Tensor in ROW_MAJOR layout on TT Accelerator device must have size of last dimension divisble by 2. - - Example of creating a TT Tensor on TT accelerator device with specified mem_config: - - .. code-block:: python - - py_tensor = torch.randn((1, 1, 32, 32)) - tt_device = ttnn.CreateDevice(0) - mem_config = ttnn.MemoryConfig(ttnn.TensorMemoryLayout.SINGLE_BANK) - // ... 
- ttnn.Tensor( - py_tensor.reshape(-1).tolist(), - py_tensor.size(), - ttnn.DataType.BFLOAT16, - ttnn.Layout.ROW_MAJOR, - tt_device, - mem_config - ) - )doc") - .def( - py::init<>([](const py::object &tensor, - std::optional data_type, - const std::unordered_map &strategy) { - if (py::isinstance(tensor)) { - return detail::convert_python_tensors_to_tt_tensors(tensor, data_type, strategy); - } - return detail::convert_python_tensor_to_tt_tensor(tensor, data_type); - }), - py::arg("tensor"), - py::arg("data_type") = std::nullopt, - py::arg("strategy") = std::unordered_map(), - py::return_value_policy::move, - R"doc( - +--------------+------------------------+ - | Argument | Description | - +==============+========================+ - | tensor | Pytorch or Numpy Tensor| - +--------------+------------------------+ - | data_type | TT Tensor data type | - +--------------+------------------------+ - - Example of creating a TT Tensor that uses torch.Tensor's storage as its own storage: - - .. code-block:: python - - py_tensor = torch.randn((1, 1, 32, 32)) - ttnn.Tensor(py_tensor) - )doc") - .def( - py::init<>([](const py::object &python_tensor, - std::optional data_type, - Device *device, - Layout layout, - const MemoryConfig &mem_config) { - auto tensor = detail::convert_python_tensor_to_tt_tensor(python_tensor, data_type); - auto layout_tensor = tensor.to(layout); - return layout_tensor.to(device, mem_config); - }), - py::arg("tensor"), - py::arg("data_type") = std::nullopt, - py::arg("device").noconvert(), - py::arg("layout").noconvert(), - py::arg("mem_config").noconvert(), - py::return_value_policy::move, - R"doc( - +--------------+------------------------+ - | Argument | Description | - +==============+========================+ - | tensor | Pytorch or Numpy Tensor| - +--------------+------------------------+ - | data_type | TT Tensor data type | - +--------------+------------------------+ - | device | TT device ptr | - +--------------+------------------------+ - | layout | TT layout | - +--------------+------------------------+ - | mem_config | TT memory_config | - +--------------+------------------------+ - - - Example of creating a TT Tensor that uses torch.Tensor's storage as its own storage: - - .. code-block:: python - - py_tensor = np.zeros((1, 1, 32, 32)) - ttnn.Tensor(py_tensor) - )doc") - .def_property_readonly("shape", [](const Tensor &self) { return self.get_shape(); }) - .def_property_readonly("dtype", [](const Tensor &self) { return self.get_dtype(); }) - .def_property_readonly("layout", [](const Tensor &self) { return self.get_layout(); }) - .def( - "deallocate", - [](Tensor &self, bool force) { return self.deallocate(force); }, - py::arg("force") = false, - R"doc( - Dellocates all data of a tensor. This either deletes all host data or deallocates tensor data from device memory. - )doc") - .def( - "to", - py::overload_cast(&Tensor::to, py::const_), - py::arg("device").noconvert(), - py::arg("mem_config").noconvert() = MemoryConfig{.memory_layout = TensorMemoryLayout::INTERLEAVED}, - py::keep_alive<0, 2>(), - R"doc( - Move TT Tensor from host device to TT accelerator device. - - Only BFLOAT16 (in ROW_MAJOR or TILE layout) and BFLOAT8_B, BFLOAT4_B (in TILE layout) are supported on device. - - If ``arg1`` is not supplied, default ``MemoryConfig`` with ``interleaved`` set to ``True``. 
- - +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ - | Argument | Description | Data type | Valid range | Required | - +===========+=================================================+============================+=======================+==========+ - | arg0 | Device to which tensor will be moved | ttnn.Device | TT accelerator device | Yes | - +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ - | arg1 | MemoryConfig of tensor of TT accelerator device | ttnn.MemoryConfig | | No | - +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ + +---------------+---------------+ + | Argument | Name | + +===============+===============+ + | arg0 | data | + +---------------+---------------+ + | arg1 | shape | + +---------------+---------------+ + | arg2 | data_type | + +---------------+---------------+ + | arg3 | layout | + +---------------+---------------+ + + Example of creating a TT Tensor on host: .. code-block:: python - tt_tensor = tt_tensor.to(tt_device) + py_tensor = torch.randn((1, 1, 32, 32)) + ttnn.Tensor( + py_tensor.reshape(-1).tolist(), + py_tensor.size(), + ttnn.DataType.BFLOAT16, + ttnn.Layout.ROW_MAJOR, + ) )doc") - .def( - "track_ref_count", - [](Tensor &self) { return self.track_ref_count(); }, - R"doc( - Log the reference count (as seen by the main and worker threads) of a tensor as it evolves during runtime. - )doc") - .def( - "to", - py::overload_cast(&Tensor::to, py::const_), - py::arg("mesh_device").noconvert(), - py::arg("mem_config").noconvert() = MemoryConfig{.memory_layout = TensorMemoryLayout::INTERLEAVED}, - py::keep_alive<0, 2>(), - R"doc( - Move TT Tensor from host device to TT accelerator device. + .def( + py::init<>([](std::vector &&data, + const std::array &shape, + DataType data_type, + Layout layout, + Device *device) { + auto owned_buffer = detail::create_owned_buffer_from_vector_of_floats(std::move(data), data_type); + auto tensor = Tensor(OwnedStorage{owned_buffer}, shape, data_type, layout); + return tensor.to(device, MemoryConfig{}); + }), + py::keep_alive<1, 6>(), + py::return_value_policy::move, + R"doc( + +---------------+---------------+ + | Argument | Name | + +===============+===============+ + | arg0 | data | + +---------------+---------------+ + | arg1 | shape | + +---------------+---------------+ + | arg2 | data_type | + +---------------+---------------+ + | arg3 | layout | + +---------------+---------------+ + | arg3 | device | + +---------------+---------------+ Only BFLOAT16 (in ROW_MAJOR or TILE layout) and BFLOAT8_B, BFLOAT4_B (in TILE layout) are supported on device. - If ``arg1`` is not supplied, default ``MemoryConfig`` with ``interleaved`` set to ``True``. + Note that TT Tensor in ROW_MAJOR layout on TT Accelerator device must have size of last dimension divisble by 2. 
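+
+            If the last dimension is odd, one illustrative workaround (a sketch, not part of this API) is to
+            pad the source torch tensor to an even width before constructing the TT Tensor:
+
+            .. code-block:: python
+
+                # hypothetical input whose last dimension (33) is odd
+                py_tensor = torch.randn((1, 1, 32, 33))
+                # pad the last dimension on the right by one element (33 -> 34) so it is even
+                py_tensor = torch.nn.functional.pad(py_tensor, [0, 1])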
- +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ - | Argument | Description | Data type | Valid range | Required | - +===========+=================================================+============================+=======================+==========+ - | arg0 | MeshDevice to which tensor will be moved | ttnn.MeshDevice | TT accelerator device | Yes | - +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ - | arg1 | MemoryConfig of tensor of TT accelerator device | ttnn.MemoryConfig | | No | - +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ + Example of creating a TT Tensor on TT accelerator device: .. code-block:: python - tt_tensor = tt_tensor.to(tt_device) + py_tensor = torch.randn((1, 1, 32, 32)) + tt_device = ttnn.CreateDevice(0) + // ... + ttnn.Tensor( + py_tensor.reshape(-1).tolist(), + py_tensor.size(), + ttnn.DataType.BFLOAT16, + ttnn.Layout.ROW_MAJOR, + tt_device + ) )doc") - .def("sync", [](Tensor &self) { return self.wait_for_tensor_data_populated(); }) - .def( - "extract_shard", - [](const Tensor &self, CoreCoord core) { return self.extract_shard(core); }, - py::arg("core").noconvert(), - py::keep_alive<0, 2>(), - R"doc( - Move TT Tensor from host device to TT accelerator device. + .def( + py::init<>([](std::vector &&data, + const std::array &shape, + DataType data_type, + Layout layout, + Device *device, + const MemoryConfig &memory_config) { + auto owned_buffer = detail::create_owned_buffer_from_vector_of_floats(std::move(data), data_type); + auto tensor = Tensor(OwnedStorage{owned_buffer}, shape, data_type, layout); + return tensor.to(device, memory_config); + }), + py::keep_alive<1, 6>(), + py::return_value_policy::move, + R"doc( + +---------------+---------------+ + | Argument | Name | + +===============+===============+ + | arg0 | data | + +---------------+---------------+ + | arg1 | shape | + +---------------+---------------+ + | arg2 | data_type | + +---------------+---------------+ + | arg3 | layout | + +---------------+---------------+ + | arg4 | device | + +---------------+---------------+ + | arg5 | mem_config | + +---------------+---------------+ Only BFLOAT16 (in ROW_MAJOR or TILE layout) and BFLOAT8_B, BFLOAT4_B (in TILE layout) are supported on device. - If ``arg1`` is not supplied, default ``MemoryConfig`` with ``interleaved`` set to ``True``. - - +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ - | Argument | Description | Data type | Valid range | Required | - +===========+=================================================+============================+=======================+==========+ - | arg0 | Core who's shard we want | ttnn.CoreCoord | TT accelerator device | Yes | - +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ + Note that TT Tensor in ROW_MAJOR layout on TT Accelerator device must have size of last dimension divisble by 2. + Example of creating a TT Tensor on TT accelerator device with specified mem_config: .. code-block:: python - tt_tensor = tt_tensor.to(tt_device) + py_tensor = torch.randn((1, 1, 32, 32)) + tt_device = ttnn.CreateDevice(0) + mem_config = ttnn.MemoryConfig(ttnn.TensorMemoryLayout.SINGLE_BANK) + // ... 
+ ttnn.Tensor( + py_tensor.reshape(-1).tolist(), + py_tensor.size(), + ttnn.DataType.BFLOAT16, + ttnn.Layout.ROW_MAJOR, + tt_device, + mem_config + ) )doc") - .def( - "extract_shard", - [](const Tensor &self, const uint32_t &core_id) { return self.extract_shard(core_id); }, - py::arg("core_id").noconvert(), - py::keep_alive<0, 2>(), - R"doc( - Move TT Tensor from host device to TT accelerator device. - - Only BFLOAT16 (in ROW_MAJOR or TILE layout) and BFLOAT8_B, BFLOAT4_B (in TILE layout) are supported on device. - - If ``arg1`` is not supplied, default ``MemoryConfig`` with ``interleaved`` set to ``True``. - - +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ - | Argument | Description | Data type | Valid range | Required | - +===========+=================================================+============================+=======================+==========+ - | arg0 | Core who's shard we want | uint32_t | TT accelerator device | Yes | - +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ + .def( + py::init<>([](const py::object &tensor, + std::optional data_type, + const std::unordered_map &strategy) { + if (py::isinstance(tensor)) { + return detail::convert_python_tensors_to_tt_tensors(tensor, data_type, strategy); + } + return detail::convert_python_tensor_to_tt_tensor(tensor, data_type); + }), + py::arg("tensor"), + py::arg("data_type") = std::nullopt, + py::arg("strategy") = std::unordered_map(), + py::return_value_policy::move, + R"doc( + +--------------+------------------------+ + | Argument | Description | + +==============+========================+ + | tensor | Pytorch or Numpy Tensor| + +--------------+------------------------+ + | data_type | TT Tensor data type | + +--------------+------------------------+ + Example of creating a TT Tensor that uses torch.Tensor's storage as its own storage: .. code-block:: python - tt_tensor = tt_tensor.to(tt_device) + py_tensor = torch.randn((1, 1, 32, 32)) + ttnn.Tensor(py_tensor) )doc") - .def( - "cpu", - [](const Tensor &self, bool blocking) { return self.cpu(blocking); }, - py::arg("blocking") = true, - R"doc( - Move TT Tensor from TT accelerator device to host device. + .def( + py::init<>([](const py::object &python_tensor, + std::optional data_type, + Device *device, + Layout layout, + const MemoryConfig &mem_config) { + auto tensor = detail::convert_python_tensor_to_tt_tensor(python_tensor, data_type); + auto layout_tensor = tensor.to(layout); + return layout_tensor.to(device, mem_config); + }), + py::arg("tensor"), + py::arg("data_type") = std::nullopt, + py::arg("device").noconvert(), + py::arg("layout").noconvert(), + py::arg("mem_config").noconvert(), + py::return_value_policy::move, + R"doc( + +--------------+------------------------+ + | Argument | Description | + +==============+========================+ + | tensor | Pytorch or Numpy Tensor| + +--------------+------------------------+ + | data_type | TT Tensor data type | + +--------------+------------------------+ + | device | TT device ptr | + +--------------+------------------------+ + | layout | TT layout | + +--------------+------------------------+ + | mem_config | TT memory_config | + +--------------+------------------------+ + + + Example of creating a TT Tensor that uses torch.Tensor's storage as its own storage: .. 
code-block:: python - tt_tensor = tt_tensor.cpu() + py_tensor = np.zeros((1, 1, 32, 32)) + ttnn.Tensor(py_tensor) )doc") - .def("cpu_sharded", &Tensor::cpu_sharded, R"doc( - Move TT Tensor from TT accelerator device to host device in sharded orientation. - - .. code-block:: python - - tt_tensor = tt_tensor.cpu_sharded() + .def_property_readonly("shape", [](const Tensor &self) { return self.get_shape(); }) + .def_property_readonly("dtype", [](const Tensor &self) { return self.get_dtype(); }) + .def_property_readonly("layout", [](const Tensor &self) { return self.get_layout(); }) + .def( + "deallocate", + [](Tensor &self, bool force) { return self.deallocate(force); }, + py::arg("force") = false, + R"doc( + Dellocates all data of a tensor. This either deletes all host data or deallocates tensor data from device memory. )doc") - .def( - "to", - py::overload_cast(&Tensor::to, py::const_), - py::arg("target_layout").noconvert(), - py::arg("worker") = nullptr, - R"doc( - Convert TT Tensor to provided memory layout. Available layouts conversions are: - - * ROW_MAJOR to TILE - * TILE to ROW_MAJOR - - +-----------+-------------------------------------------------+----------------------------+--------------------------------+----------+ - | Argument | Description | Data type | Valid range | Required | - +===========+=================================================+============================+================================+==========+ - | arg0 | Target memory layout | ttnn.Layout | ROW_MAJOR, TILE | Yes | - +-----------+-------------------------------------------------+----------------------------+--------------------------------+----------+ - | arg1 | Worker thread performing layout conversion | ttnn.Device | Thread tied to TT accelerator | No | - | | (optional) | | device | | - +-----------+-------------------------------------------------+----------------------------+--------------------------------+----------+ + .def( + "to", + py::overload_cast(&Tensor::to, py::const_), + py::arg("device").noconvert(), + py::arg("mem_config").noconvert() = MemoryConfig{.memory_layout = TensorMemoryLayout::INTERLEAVED}, + py::keep_alive<0, 2>(), + R"doc( + Move TT Tensor from host device to TT accelerator device. - .. code-block:: python + Only BFLOAT16 (in ROW_MAJOR or TILE layout) and BFLOAT8_B, BFLOAT4_B (in TILE layout) are supported on device. - tt_tensor = tt_tensor.to(ttnn.Layout.TILE, worker) - )doc") - .def( - "to", - py::overload_cast(&Tensor::to, py::const_), - py::arg("target_layout").noconvert(), - py::arg("mesh_device") = nullptr, - R"doc( - Convert TT Tensor to provided memory layout. 
Available layouts conversions are: - - * ROW_MAJOR to TILE - * TILE to ROW_MAJOR - - +-----------+-------------------------------------------------+----------------------------+--------------------------------+----------+ - | Argument | Description | Data type | Valid range | Required | - +===========+=================================================+============================+================================+==========+ - | arg0 | Target memory layout | ttnn.Layout | ROW_MAJOR, TILE | Yes | - +-----------+-------------------------------------------------+----------------------------+--------------------------------+----------+ - | arg1 | Worker thread performing layout conversion | ttnn.Device | Thread tied to TT accelerator | No | - | | (optional) | | device | | - +-----------+-------------------------------------------------+----------------------------+--------------------------------+----------+ + If ``arg1`` is not supplied, default ``MemoryConfig`` with ``interleaved`` set to ``True``. - .. code-block:: python + +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ + | Argument | Description | Data type | Valid range | Required | + +===========+=================================================+============================+=======================+==========+ + | arg0 | Device to which tensor will be moved | ttnn.Device | TT accelerator device | Yes | + +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ + | arg1 | MemoryConfig of tensor of TT accelerator device | ttnn.MemoryConfig | | No | + +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ - tt_tensor = tt_tensor.to(ttnn.Layout.TILE, mesh_device) - )doc") - .def( - "pad", - [](const Tensor &self, - const std::array &output_tensor_shape, - const std::array &input_tensor_start, - float pad_value) { return self.pad(output_tensor_shape, input_tensor_start, pad_value); }, - R"doc( - Pad TT Tensor with given pad value ``arg2``. - - The input tensor must be on host and in ROW_MAJOR layout. - - Returns an output tensor that contains the input tensor at the given input tensor start indices ``arg1`` and the padded value everywhere else. - - +---------------------+------------------------------------------------------+--------------+-----------------------------------------------------+----------+ - | Argument | Description | Data type | Valid range | Required | - +=====================+======================================================+==============+=====================================================+==========+ - | arg0 | Shape of output tensor | List[int[4]] | | Yes | - +---------------------+------------------------------------------------------+--------------+-----------------------------------------------------+----------+ - | arg1 | Start indices to place input tensor in output tensor | List[int[4]] | Values along each dim must be | Yes | - | | | | | | - | | | | <= (output_tensor_shape[i] - input_tensor_shape[i]) | | - +---------------------+------------------------------------------------------+--------------+-----------------------------------------------------+----------+ - | arg2 | Value to pad input tensor | float | | Yes | - +---------------------+------------------------------------------------------+--------------+-----------------------------------------------------+----------+ + .. 
code-block:: python - .. code-block:: python + tt_tensor = tt_tensor.to(tt_device) + )doc") + .def( + "track_ref_count", + [](Tensor &self) { return self.track_ref_count(); }, + R"doc( + Log the reference count (as seen by the main and worker threads) of a tensor as it evolves during runtime. + )doc") + .def( + "to", + py::overload_cast(&Tensor::to, py::const_), + py::arg("mesh_device").noconvert(), + py::arg("mem_config").noconvert() = MemoryConfig{.memory_layout = TensorMemoryLayout::INTERLEAVED}, + py::keep_alive<0, 2>(), + R"doc( + Move TT Tensor from host device to TT accelerator device. - input_tensor_shape = [1, 1, 3, 3] - output_tensor_shape = [1, 2, 5, 5] - input_tensor_start = [0, 1, 1, 1] - pad_value = 0 + Only BFLOAT16 (in ROW_MAJOR or TILE layout) and BFLOAT8_B, BFLOAT4_B (in TILE layout) are supported on device. - inp = torch.Tensor( - [ 1, 2, 3, - 4, 5, 6, - 7, 8, 9 ] - ) - tt_tensor = ttnn.Tensor( - inp.tolist(), - input_tensor_shape, - ttnn.DataType.BFLOAT16, - ttnn.Layout.ROW_MAJOR, - ) - tt_tensor_padded = tt_tensor.pad(output_tensor_shape, input_tensor_start, pad_value) - - print("Input tensor:") - print(tt_tensor) - print("\nPadded tensor:") - print(tt_tensor_padded) - - Example output: - - .. code-block:: - - Input tensor: - [ [[[1, 2, 3], - [4, 5, 6], - [7, 8, 9]]] dtype=bfloat16 ] - - Padded tensor: - [ [[[0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0]], - - [[0, 0, 0, 0, 0], - [0, 1, 2, 3, 0], - [0, 4, 5, 6, 0], - [0, 7, 8, 9, 0], - [0, 0, 0, 0, 0]]] dtype=bfloat16 ] - )doc") - .def( - "unpad", - [](const Tensor &self, - const std::array &output_tensor_start, - const std::array &output_tensor_end) { - return self.unpad(output_tensor_start, output_tensor_end); - }, - R"doc( - Unpad this TT Tensor. - - This tensor must be on host and in ROW_MAJOR layout. - - Returns an output tensor from output tensor start indices ``arg0`` to output tensor end indices ``arg1`` (inclusive) of the input tensor. - - +---------------------+----------------------------------------------+--------------+-----------------------------------------------------+----------+ - | Argument | Description | Data type | Valid range | Required | - +=====================+==============================================+==============+=====================================================+==========+ - | arg0 | Start indices of input tensor | List[int[4]] | Values along each dim must be | Yes | - | | | | | | - | | | | < input_tensor_shape[i] and <= output_tensor_end[i] | | - +---------------------+----------------------------------------------+--------------+-----------------------------------------------------+----------+ - | arg1 | End indices of input tensor in output tensor | List[int[4]] | Values along each dim must be | Yes | - | | | | | | - | | | | < input_tensor_shape[i] | | - +---------------------+----------------------------------------------+--------------+-----------------------------------------------------+----------+ + If ``arg1`` is not supplied, default ``MemoryConfig`` with ``interleaved`` set to ``True``. - .. 
code-block:: python + +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ + | Argument | Description | Data type | Valid range | Required | + +===========+=================================================+============================+=======================+==========+ + | arg0 | MeshDevice to which tensor will be moved | ttnn.MeshDevice | TT accelerator device | Yes | + +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ + | arg1 | MemoryConfig of tensor of TT accelerator device | ttnn.MemoryConfig | | No | + +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ - input_tensor_shape = [1, 1, 5, 5] - output_tensor_start = [0, 0, 1, 1] - output_tensor_end = [0, 0, 3, 3] + .. code-block:: python - inp = torch.Tensor( - [ 0, 0, 0, 0, 0, - 0, 1, 2, 3, 0, - 0, 4, 5, 6, 0, - 0, 7, 8, 9, 0, - 0, 0, 0, 0, 0 ] - ) - tt_tensor = ttnn.Tensor( - inp.tolist(), - input_tensor_shape, - ttnn.DataType.BFLOAT16, - ttnn.Layout.ROW_MAJOR, - ) - tt_tensor_unpadded = tt_tensor.unpad(output_tensor_start, output_tensor_end) + tt_tensor = tt_tensor.to(tt_device) + )doc") + .def("sync", [](Tensor &self) { return self.wait_for_tensor_data_populated(); }) + .def( + "extract_shard", + [](const Tensor &self, CoreCoord core) { return self.extract_shard(core); }, + py::arg("core").noconvert(), + py::keep_alive<0, 2>(), + R"doc( + Move TT Tensor from host device to TT accelerator device. - print("Input tensor:") - print(tt_tensor) - print("\nUnpadded tensor:") - print(tt_tensor_unpadded) + Only BFLOAT16 (in ROW_MAJOR or TILE layout) and BFLOAT8_B, BFLOAT4_B (in TILE layout) are supported on device. - Example output: + If ``arg1`` is not supplied, default ``MemoryConfig`` with ``interleaved`` set to ``True``. - .. code-block:: + +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ + | Argument | Description | Data type | Valid range | Required | + +===========+=================================================+============================+=======================+==========+ + | arg0 | Core who's shard we want | ttnn.CoreCoord | TT accelerator device | Yes | + +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ - Input tensor: - [ [[[0, 0, 0, 0, 0], - [0, 1, 2, 3, 0], - [0, 4, 5, 6, 0], - [0, 7, 8, 9, 0], - [0, 0, 0, 0, 0]]] dtype=bfloat16 ] - Unpadded tensor: - [ [[[1, 2, 3], - [4, 5, 6], - [7, 8, 9]]] dtype=bfloat16 ] - )doc") - .def( - "pad_to_tile", [](const Tensor &self, float pad_value) { return self.pad_to_tile(pad_value); }, R"doc( - Pads TT Tensor with given pad value ``arg0``. + .. code-block:: python - The input tensor must be on host and in ROW_MAJOR layout. + tt_tensor = tt_tensor.to(tt_device) + )doc") + .def( + "extract_shard", + [](const Tensor &self, const uint32_t &core_id) { return self.extract_shard(core_id); }, + py::arg("core_id").noconvert(), + py::keep_alive<0, 2>(), + R"doc( + Move TT Tensor from host device to TT accelerator device. - Returns an output tensor that contains the input tensor padded with the padded value in the last two dims to multiples of 32. + Only BFLOAT16 (in ROW_MAJOR or TILE layout) and BFLOAT8_B, BFLOAT4_B (in TILE layout) are supported on device. 
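+
+            A minimal usage sketch (the core id ``0`` is illustrative; the tensor is assumed to be sharded
+            on device):
+
+            .. code-block:: python
+
+                shard = tt_tensor.extract_shard(0)  # Tensor holding the shard assigned to core id 0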
- Padding will be added to the right and bottom of the tensor. + If ``arg1`` is not supplied, default ``MemoryConfig`` with ``interleaved`` set to ``True``. - +---------------------+------------------------------------------------------+--------------+-----------------------------------------------------+----------+ - | Argument | Description | Data type | Valid range | Required | - +=====================+======================================================+==============+=====================================================+==========+ - | arg0 | Value to pad input tensor | float | | Yes | - +---------------------+------------------------------------------------------+--------------+-----------------------------------------------------+----------+ + +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ + | Argument | Description | Data type | Valid range | Required | + +===========+=================================================+============================+=======================+==========+ + | arg0 | Core who's shard we want | uint32_t | TT accelerator device | Yes | + +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ - .. code-block:: python - input_tensor_shape = [1, 1, 3, 3] - pad_value = 0 + .. code-block:: python - inp = torch.Tensor( - [ 1, 2, 3, - 4, 5, 6, - 7, 8, 9 ] - ) - tt_tensor = ttnn.Tensor( - inp.tolist(), - input_tensor_shape, - ttnn.DataType.BFLOAT16, - ttnn.Layout.ROW_MAJOR, - ) - tt_tensor_padded = tt_tensor.pad_to_tile(pad_value) + tt_tensor = tt_tensor.to(tt_device) + )doc") + .def( + "cpu", + [](const Tensor &self, bool blocking) { return self.cpu(blocking); }, + py::arg("blocking") = true, + R"doc( + Move TT Tensor from TT accelerator device to host device. - print("Input tensor:") - print(tt_tensor) - print("\nPadded tensor:") - print(tt_tensor_padded) + .. code-block:: python - Example output: + tt_tensor = tt_tensor.cpu() + )doc") + .def("cpu_sharded", &Tensor::cpu_sharded, R"doc( + Move TT Tensor from TT accelerator device to host device in sharded orientation. - .. code-block:: + .. code-block:: python - Input tensor: - [ [[[1, 2, 3], - [4, 5, 6], - [7, 8, 9]]] dtype=bfloat16 ] + tt_tensor = tt_tensor.cpu_sharded() + )doc") + .def( + "to", + py::overload_cast(&Tensor::to, py::const_), + py::arg("target_layout").noconvert(), + py::arg("worker") = nullptr, + R"doc( + Convert TT Tensor to provided memory layout. Available layouts conversions are: - Padded tensor: - [ [[[1, 2, 3, 0, ..., 0], - [4, 5, 6, 0, ..., 0], - [7, 8, 9, 0, ..., 0], - [0, 0, 0, 0, ..., 0], - ..., - [0, 0, 0, 0, ..., 0]]] dtype=bfloat16 ] - )doc") - .def( - "unpad_from_tile", - [](const Tensor &self, const std::vector &output_tensor_shape) { - return self.unpad_from_tile(output_tensor_shape); - }, - R"doc( - Unpads TT Tensor from given input tensor ``arg0``. + * ROW_MAJOR to TILE + * TILE to ROW_MAJOR - The input tensor must be on host and in ROW_MAJOR layout. 
+ +-----------+-------------------------------------------------+----------------------------+--------------------------------+----------+ + | Argument | Description | Data type | Valid range | Required | + +===========+=================================================+============================+================================+==========+ + | arg0 | Target memory layout | ttnn.Layout | ROW_MAJOR, TILE | Yes | + +-----------+-------------------------------------------------+----------------------------+--------------------------------+----------+ + | arg1 | Worker thread performing layout conversion | ttnn.Device | Thread tied to TT accelerator | No | + | | (optional) | | device | | + +-----------+-------------------------------------------------+----------------------------+--------------------------------+----------+ - This function expects the real data to aligned on the top left of the tensor. + .. code-block:: python - Returns an output tensor with padding removed from the right and bottom of the input tensor. + tt_tensor = tt_tensor.to(ttnn.Layout.TILE, worker) + )doc") + .def( + "to", + py::overload_cast(&Tensor::to, py::const_), + py::arg("target_layout").noconvert(), + py::arg("mesh_device") = nullptr, + R"doc( + Convert TT Tensor to provided memory layout. Available layouts conversions are: - +---------------------+----------------------------------------------+--------------+------------------------------------------------------------------------------+----------+ - | Argument | Description | Data type | Valid range | Required | - +=====================+==============================================+==============+==============================================================================+==========+ - | arg0 | Shape of output tensor | List[int[4]] | All dims must match the input tensor dims apart from the last two dims. | Yes | - | | | | | | - | | | | Last two dims have the following restrictions: | | - | | | | | | - | | | | input_tensor_shape[i] must be a multiple of 32 | | - | | | | | | - | | | | input_tensor_shape[i] - 32 < output_tensor_shape[i] <= input_tensor_shape[i] | | - +---------------------+----------------------------------------------+--------------+------------------------------------------------------------------------------+----------+ + * ROW_MAJOR to TILE + * TILE to ROW_MAJOR + +-----------+-------------------------------------------------+----------------------------+--------------------------------+----------+ + | Argument | Description | Data type | Valid range | Required | + +===========+=================================================+============================+================================+==========+ + | arg0 | Target memory layout | ttnn.Layout | ROW_MAJOR, TILE | Yes | + +-----------+-------------------------------------------------+----------------------------+--------------------------------+----------+ + | arg1 | Worker thread performing layout conversion | ttnn.Device | Thread tied to TT accelerator | No | + | | (optional) | | device | | + +-----------+-------------------------------------------------+----------------------------+--------------------------------+----------+ - .. code-block:: python + .. 
code-block:: python - input_tensor_shape = [1, 1, 32, 32] - output_tensor_shape = [1, 1, 3, 3] + tt_tensor = tt_tensor.to(ttnn.Layout.TILE, mesh_device) + )doc") + .def( + "pad", + [](const Tensor &self, + const std::array &output_tensor_shape, + const std::array &input_tensor_start, + float pad_value) { return self.pad(output_tensor_shape, input_tensor_start, pad_value); }, + R"doc( + Pad TT Tensor with given pad value ``arg2``. + + The input tensor must be on host and in ROW_MAJOR layout. + + Returns an output tensor that contains the input tensor at the given input tensor start indices ``arg1`` and the padded value everywhere else. + + +---------------------+------------------------------------------------------+--------------+-----------------------------------------------------+----------+ + | Argument | Description | Data type | Valid range | Required | + +=====================+======================================================+==============+=====================================================+==========+ + | arg0 | Shape of output tensor | List[int[4]] | | Yes | + +---------------------+------------------------------------------------------+--------------+-----------------------------------------------------+----------+ + | arg1 | Start indices to place input tensor in output tensor | List[int[4]] | Values along each dim must be | Yes | + | | | | | | + | | | | <= (output_tensor_shape[i] - input_tensor_shape[i]) | | + +---------------------+------------------------------------------------------+--------------+-----------------------------------------------------+----------+ + | arg2 | Value to pad input tensor | float | | Yes | + +---------------------+------------------------------------------------------+--------------+-----------------------------------------------------+----------+ + + .. code-block:: python + + input_tensor_shape = [1, 1, 3, 3] + output_tensor_shape = [1, 2, 5, 5] + input_tensor_start = [0, 1, 1, 1] + pad_value = 0 + + inp = torch.Tensor( + [ 1, 2, 3, + 4, 5, 6, + 7, 8, 9 ] + ) + tt_tensor = ttnn.Tensor( + inp.tolist(), + input_tensor_shape, + ttnn.DataType.BFLOAT16, + ttnn.Layout.ROW_MAJOR, + ) + tt_tensor_padded = tt_tensor.pad(output_tensor_shape, input_tensor_start, pad_value) + + print("Input tensor:") + print(tt_tensor) + print("\nPadded tensor:") + print(tt_tensor_padded) + + Example output: + + .. code-block:: + + Input tensor: + [ [[[1, 2, 3], + [4, 5, 6], + [7, 8, 9]]] dtype=bfloat16 ] + + Padded tensor: + [ [[[0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0]], + + [[0, 0, 0, 0, 0], + [0, 1, 2, 3, 0], + [0, 4, 5, 6, 0], + [0, 7, 8, 9, 0], + [0, 0, 0, 0, 0]]] dtype=bfloat16 ] + )doc") + .def( + "unpad", + [](const Tensor &self, + const std::array &output_tensor_start, + const std::array &output_tensor_end) { + return self.unpad(output_tensor_start, output_tensor_end); + }, + R"doc( + Unpad this TT Tensor. + + This tensor must be on host and in ROW_MAJOR layout. + + Returns an output tensor from output tensor start indices ``arg0`` to output tensor end indices ``arg1`` (inclusive) of the input tensor. 
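+
+            Since both index lists are inclusive, the returned tensor has size
+            ``output_tensor_end[i] - output_tensor_start[i] + 1`` along each dimension ``i``.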
+ + +---------------------+----------------------------------------------+--------------+-----------------------------------------------------+----------+ + | Argument | Description | Data type | Valid range | Required | + +=====================+==============================================+==============+=====================================================+==========+ + | arg0 | Start indices of input tensor | List[int[4]] | Values along each dim must be | Yes | + | | | | | | + | | | | < input_tensor_shape[i] and <= output_tensor_end[i] | | + +---------------------+----------------------------------------------+--------------+-----------------------------------------------------+----------+ + | arg1 | End indices of input tensor in output tensor | List[int[4]] | Values along each dim must be | Yes | + | | | | | | + | | | | < input_tensor_shape[i] | | + +---------------------+----------------------------------------------+--------------+-----------------------------------------------------+----------+ + + .. code-block:: python + + input_tensor_shape = [1, 1, 5, 5] + output_tensor_start = [0, 0, 1, 1] + output_tensor_end = [0, 0, 3, 3] + + inp = torch.Tensor( + [ 0, 0, 0, 0, 0, + 0, 1, 2, 3, 0, + 0, 4, 5, 6, 0, + 0, 7, 8, 9, 0, + 0, 0, 0, 0, 0 ] + ) + tt_tensor = ttnn.Tensor( + inp.tolist(), + input_tensor_shape, + ttnn.DataType.BFLOAT16, + ttnn.Layout.ROW_MAJOR, + ) + tt_tensor_unpadded = tt_tensor.unpad(output_tensor_start, output_tensor_end) + + print("Input tensor:") + print(tt_tensor) + print("\nUnpadded tensor:") + print(tt_tensor_unpadded) + + Example output: + + .. code-block:: + + Input tensor: + [ [[[0, 0, 0, 0, 0], + [0, 1, 2, 3, 0], + [0, 4, 5, 6, 0], + [0, 7, 8, 9, 0], + [0, 0, 0, 0, 0]]] dtype=bfloat16 ] + + Unpadded tensor: + [ [[[1, 2, 3], + [4, 5, 6], + [7, 8, 9]]] dtype=bfloat16 ] + )doc") + .def( + "pad_to_tile", [](const Tensor &self, float pad_value) { return self.pad_to_tile(pad_value); }, R"doc( + Pads TT Tensor with given pad value ``arg0``. + + The input tensor must be on host and in ROW_MAJOR layout. + + Returns an output tensor that contains the input tensor padded with the padded value in the last two dims to multiples of 32. + + Padding will be added to the right and bottom of the tensor. + + +---------------------+------------------------------------------------------+--------------+-----------------------------------------------------+----------+ + | Argument | Description | Data type | Valid range | Required | + +=====================+======================================================+==============+=====================================================+==========+ + | arg0 | Value to pad input tensor | float | | Yes | + +---------------------+------------------------------------------------------+--------------+-----------------------------------------------------+----------+ + + .. code-block:: python + + input_tensor_shape = [1, 1, 3, 3] + pad_value = 0 + + inp = torch.Tensor( + [ 1, 2, 3, + 4, 5, 6, + 7, 8, 9 ] + ) + tt_tensor = ttnn.Tensor( + inp.tolist(), + input_tensor_shape, + ttnn.DataType.BFLOAT16, + ttnn.Layout.ROW_MAJOR, + ) + tt_tensor_padded = tt_tensor.pad_to_tile(pad_value) + + print("Input tensor:") + print(tt_tensor) + print("\nPadded tensor:") + print(tt_tensor_padded) + + Example output: + + .. 
code-block:: + + Input tensor: + [ [[[1, 2, 3], + [4, 5, 6], + [7, 8, 9]]] dtype=bfloat16 ] + + Padded tensor: + [ [[[1, 2, 3, 0, ..., 0], + [4, 5, 6, 0, ..., 0], + [7, 8, 9, 0, ..., 0], + [0, 0, 0, 0, ..., 0], + ..., + [0, 0, 0, 0, ..., 0]]] dtype=bfloat16 ] + )doc") + .def( + "unpad_from_tile", + [](const Tensor &self, const std::vector &output_tensor_shape) { + return self.unpad_from_tile(output_tensor_shape); + }, + R"doc( + Unpads TT Tensor from given input tensor ``arg0``. + + The input tensor must be on host and in ROW_MAJOR layout. + + This function expects the real data to aligned on the top left of the tensor. + + Returns an output tensor with padding removed from the right and bottom of the input tensor. + + +---------------------+----------------------------------------------+--------------+------------------------------------------------------------------------------+----------+ + | Argument | Description | Data type | Valid range | Required | + +=====================+==============================================+==============+==============================================================================+==========+ + | arg0 | Shape of output tensor | List[int[4]] | All dims must match the input tensor dims apart from the last two dims. | Yes | + | | | | | | + | | | | Last two dims have the following restrictions: | | + | | | | | | + | | | | input_tensor_shape[i] must be a multiple of 32 | | + | | | | | | + | | | | input_tensor_shape[i] - 32 < output_tensor_shape[i] <= input_tensor_shape[i] | | + +---------------------+----------------------------------------------+--------------+------------------------------------------------------------------------------+----------+ + + + .. code-block:: python + + input_tensor_shape = [1, 1, 32, 32] + output_tensor_shape = [1, 1, 3, 3] + + inp = torch.arange(start=1.0, end=10.0).reshape(1, 1, 3, 3) + inp = torch.nn.functional.pad(inp, [0, input_tensor_shape[3] - inp.shape[3], 0, input_tensor_shape[2] - inp.shape[2]]).reshape(-1) + tt_tensor = ttnn.Tensor( + inp.tolist(), + input_tensor_shape, + ttnn.DataType.BFLOAT16, + ttnn.Layout.ROW_MAJOR, + ) + tt_tensor_unpadded = tt_tensor.unpad_from_tile(output_tensor_shape) + + print("Input tensor:") + print(tt_tensor) + print("\nUnpadded tensor:") + print(tt_tensor_unpadded) + + Example output: + + .. code-block:: + + Input tensor: + [ [[[1, 2, 3, 0, ..., 0], + [4, 5, 6, 0, ..., 0], + [7, 8, 9, 0, ..., 0], + [0, 0, 0, 0, ..., 0], + ..., + [0, 0, 0, 0, ..., 0]]] dtype=bfloat16 ] + + Unpadded tensor: + [ [[[1, 2, 3], + [4, 5, 6], + [7, 8, 9]]] dtype=bfloat16 ] + )doc") + .def( + "__repr__", [](const Tensor &self) { return self.write_to_string(); }, R"doc( + Prints the tensor as list of nested lists. Number of levels of nesting is equal to tensor rank. - inp = torch.arange(start=1.0, end=10.0).reshape(1, 1, 3, 3) - inp = torch.nn.functional.pad(inp, [0, input_tensor_shape[3] - inp.shape[3], 0, input_tensor_shape[2] - inp.shape[2]]).reshape(-1) - tt_tensor = ttnn.Tensor( - inp.tolist(), - input_tensor_shape, - ttnn.DataType.BFLOAT16, - ttnn.Layout.ROW_MAJOR, - ) - tt_tensor_unpadded = tt_tensor.unpad_from_tile(output_tensor_shape) + .. code-block:: python - print("Input tensor:") - print(tt_tensor) - print("\nUnpadded tensor:") - print(tt_tensor_unpadded) + print(tt_tensor) - Example output: + Example output for a rank 4 TT Tensor with shape (1, 1, 32, 32): - .. code-block:: + .. 
-                inp = torch.arange(start=1.0, end=10.0).reshape(1, 1, 3, 3)
-                inp = torch.nn.functional.pad(inp, [0, input_tensor_shape[3] - inp.shape[3], 0, input_tensor_shape[2] - inp.shape[2]]).reshape(-1)
-                tt_tensor = ttnn.Tensor(
-                    inp.tolist(),
-                    input_tensor_shape,
-                    ttnn.DataType.BFLOAT16,
-                    ttnn.Layout.ROW_MAJOR,
-                )
-                tt_tensor_unpadded = tt_tensor.unpad_from_tile(output_tensor_shape)
-
-                print("Input tensor:")
-                print(tt_tensor)
-                print("\nUnpadded tensor:")
-                print(tt_tensor_unpadded)
-
-            Example output:
-
-            .. code-block::
-
-                Input tensor:
-                [ [[[1, 2, 3, 0, ..., 0],
-                    [4, 5, 6, 0, ..., 0],
-                    [7, 8, 9, 0, ..., 0],
-                    [0, 0, 0, 0, ..., 0],
-                    ...,
-                    [0, 0, 0, 0, ..., 0]]] dtype=bfloat16 ]
-
-                Unpadded tensor:
-                [ [[[1, 2, 3],
-                    [4, 5, 6],
-                    [7, 8, 9]]] dtype=bfloat16 ]
-            )doc")
-            .def(
-                "__repr__", [](const Tensor &self) { return self.write_to_string(); }, R"doc(
-                Prints the tensor as list of nested lists. Number of levels of nesting is equal to tensor rank.
-
-                .. code-block:: python
-
-                    print(tt_tensor)
-
-                Example output for a rank 4 TT Tensor with shape (1, 1, 32, 32):
-
-                .. code-block::
-
-                    [ [[[0.220703, 0.839844, 0.960938, ..., 0.378906, 0.507812],
-                        [0.03125, 0.511719, 0.0407715, ..., 0.945312, 0.671875],
-                        ...
-                        [0.433594, 0.165039, 0.980469, ..., , 0.349609]]] dtype=bfloat16 ]
-
-            )doc")
+        .def(
+            "__repr__", [](const Tensor &self) { return self.write_to_string(); }, R"doc(
+            Prints the tensor as a list of nested lists. The number of nesting levels equals the tensor rank.
+
+            .. code-block:: python
+
+                print(tt_tensor)
+
+            Example output for a rank 4 TT Tensor with shape (1, 1, 32, 32):
+
+            .. code-block::
+
+                [ [[[0.220703, 0.839844, 0.960938, ..., 0.378906, 0.507812],
+                    [0.03125, 0.511719, 0.0407715, ..., 0.945312, 0.671875],
+                    ...
+                    [0.433594, 0.165039, 0.980469, ..., 0.349609]]] dtype=bfloat16 ]
+
+        )doc")
-            .def(
-                "get_legacy_shape",
-                [](const Tensor &self) { return self.get_legacy_shape(); },
-                R"doc(
-                Get the shape of the tensor as Shape class.
-
-                .. code-block:: python
-
-                    shape = tt_tensor.get_legacy_shape()
-
-            )doc")
+        .def(
+            "get_legacy_shape",
+            [](const Tensor &self) { return self.get_legacy_shape(); },
+            R"doc(
+            Get the shape of the tensor as Shape class.
+
+            .. code-block:: python
+
+                shape = tt_tensor.get_legacy_shape()
+
+        )doc")
-            .def(
-                "volume", [](const Tensor &self) { return self.volume(); }, R"doc(
-                Get the volume of the tensor.
-
-                .. code-block:: python
-
-                    volume = tt_tensor.volume()
-
-            )doc")
+        .def(
+            "volume", [](const Tensor &self) { return self.volume(); }, R"doc(
+            Get the volume of the tensor.
+
+            .. code-block:: python
+
+                volume = tt_tensor.volume()
+
+        )doc")
-            .def(
-                "storage_type", [](const Tensor &self) { return self.storage_type(); }, R"doc(
-                Check if the tensor is on host
-
-                .. code-block:: python
-
-                    storage_type = tt_tensor.storage_type()
-
-            )doc")
+        .def(
+            "storage_type", [](const Tensor &self) { return self.storage_type(); }, R"doc(
+            Get the storage type of the tensor.
+
+            .. code-block:: python
+
+                storage_type = tt_tensor.storage_type()
+
+        )doc")
-            .def(
-                "device",
-                [](const Tensor &self) { return self.device(); },
-                R"doc(
-                Get the device of the tensor.
-
-                .. code-block:: python
-
-                    device = tt_tensor.device()
-
-            )doc",
-                py::return_value_policy::reference)
+        .def(
+            "device",
+            [](const Tensor &self) { return self.device(); },
+            R"doc(
+            Get the device of the tensor.
+
+            .. code-block:: python
+
+                device = tt_tensor.device()
+
+        )doc",
+            py::return_value_policy::reference)
-            .def(
-                "devices",
-                [](const Tensor &self) { return self.get_workers(); },
-                R"doc(
-                Get devices tensor is mapped on to.
-
-                .. code-block:: python
-
-                    devices = tt_tensor.devices()
-
-            )doc",
-                py::return_value_policy::reference)
+        .def(
+            "devices",
+            [](const Tensor &self) { return self.get_workers(); },
+            R"doc(
+            Get the devices the tensor is mapped onto.
+
+            .. code-block:: python
+
+                devices = tt_tensor.devices()
+
+        )doc",
+            py::return_value_policy::reference)
-            .def(
-                "to_torch",
-                [](const Tensor &self) -> py::object { return detail::convert_tt_tensor_to_torch_tensor(self); },
-                R"doc(
-                Convert tensor to torch tensor.
-
-                The tensor must be on host when calling this function.
-
-                .. code-block:: python
-
-                    data = tt_tensor.cpu().to_torch() # move TT Tensor to host and convert it to torch tensor
-
-            )doc")
+        .def(
+            "to_torch",
+            [](const Tensor &self) -> py::object { return detail::convert_tt_tensor_to_torch_tensor(self); },
+            R"doc(
+            Convert tensor to torch tensor.
+
+            The tensor must be on host when calling this function.
+
+            .. code-block:: python
+
+                data = tt_tensor.cpu().to_torch() # move TT Tensor to host and convert it to torch tensor
+
+        )doc")
-            .def(
-                "to_numpy",
-                [](const Tensor &self) -> py::object { return detail::convert_tt_tensor_to_numpy_tensor(self); },
-                R"doc(
-                Convert tensor to numpy tensor.
-
-                The tensor must be on host when calling this function.
-
-                .. code-block:: python
-
-                    data = tt_tensor.cpu().to_numpy() # move TT Tensor to host and convert it to numpy tensor
-
-            )doc")
+        .def(
+            "to_numpy",
+            [](const Tensor &self) -> py::object { return detail::convert_tt_tensor_to_numpy_tensor(self); },
+            R"doc(
+            Convert tensor to numpy tensor.
+
+            The tensor must be on host when calling this function.
+
+            .. code-block:: python
+
+                data = tt_tensor.cpu().to_numpy() # move TT Tensor to host and convert it to numpy tensor
+
+        )doc")
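
``to_torch`` and ``to_numpy`` only read host storage, which is why the docstring examples route through ``cpu()`` first. A minimal host-only round trip, assuming a built ``ttnn`` module is importable (no device is needed because the tensor is constructed on host):

.. code-block:: python

    import torch
    import ttnn

    x = torch.rand(1, 1, 32, 32).bfloat16()
    tt = ttnn.Tensor(
        x.reshape(-1).tolist(),
        [1, 1, 32, 32],
        ttnn.DataType.BFLOAT16,
        ttnn.Layout.ROW_MAJOR,
    )
    y = tt.to_torch()  # already on host, so no cpu() hop is required
    assert tuple(y.shape) == (1, 1, 32, 32)
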
-            .def(
-                "buffer",
-                [](const Tensor &self) -> std::variant<OwnedBuffer, BorrowedBuffer> {
-                    return std::visit(
-                        [](auto &&storage) -> std::variant<OwnedBuffer, BorrowedBuffer> {
-                            using T = std::decay_t<decltype(storage)>;
-                            if constexpr (std::is_same_v<T, OwnedStorage>) {
-                                return storage.buffer;
-                            } else if constexpr (std::is_same_v<T, DeviceStorage>) {
-                                TT_THROW("Device storage doesn't support buffer method");
-                            } else if constexpr (std::is_same_v<T, BorrowedStorage>) {
-                                return storage.buffer;
-                            } else if constexpr (std::is_same_v<T, MultiDeviceStorage>) {
-                                TT_THROW("MultiDeviceStorage doesn't support buffer method");
-                            } else if constexpr (std::is_same_v<T, MultiDeviceHostStorage>) {
-                                TT_THROW("MultiDeviceHostStorage doesn't support buffer method");
-                            } else {
-                                raise_unsupported_storage<T>();
-                            }
-                        },
-                        self.get_storage());
-                },
-                R"doc(
-                Get the underlying buffer.
-
-                The tensor must be on the cpu when calling this function.
-
-                .. code-block:: python
-
-                    buffer = tt_tensor.cpu().buffer() # move TT Tensor to host and get the buffer
-
-            )doc")
+        .def(
+            "buffer",
+            [](const Tensor &self) -> std::variant<OwnedBuffer, BorrowedBuffer> {
+                return std::visit(
+                    [](auto &&storage) -> std::variant<OwnedBuffer, BorrowedBuffer> {
+                        using T = std::decay_t<decltype(storage)>;
+                        if constexpr (std::is_same_v<T, OwnedStorage>) {
+                            return storage.buffer;
+                        } else if constexpr (std::is_same_v<T, DeviceStorage>) {
+                            TT_THROW("Device storage doesn't support buffer method");
+                        } else if constexpr (std::is_same_v<T, BorrowedStorage>) {
+                            return storage.buffer;
+                        } else if constexpr (std::is_same_v<T, MultiDeviceStorage>) {
+                            TT_THROW("MultiDeviceStorage doesn't support buffer method");
+                        } else if constexpr (std::is_same_v<T, MultiDeviceHostStorage>) {
+                            TT_THROW("MultiDeviceHostStorage doesn't support buffer method");
+                        } else {
+                            raise_unsupported_storage<T>();
+                        }
+                    },
+                    self.get_storage());
+            },
+            R"doc(
+            Get the underlying buffer.
+
+            The tensor must be on the CPU when calling this function.
+
+            .. code-block:: python
+
+                buffer = tt_tensor.cpu().buffer() # move TT Tensor to host and get the buffer
+
+        )doc")
-            .def(
-                "buffer_address",
-                [](const Tensor &self) -> uint32_t {
-                    return std::visit(
-                        [](auto &&storage) -> uint32_t {
-                            using T = std::decay_t<decltype(storage)>;
-                            if constexpr (std::is_same_v<T, OwnedStorage>) {
-                                TT_THROW("OwnedStorage doesn't support buffer_address method");
-                            } else if constexpr (std::is_same_v<T, DeviceStorage>) {
-                                return storage.buffer->address();
-                            } else if constexpr (std::is_same_v<T, BorrowedStorage>) {
-                                TT_THROW("BorrowedStorage doesn't support buffer_address method");
-                            } else if constexpr (std::is_same_v<T, MultiDeviceStorage>) {
-                                TT_THROW("MultiDeviceStorage doesn't support buffer_address method");
-                            } else if constexpr (std::is_same_v<T, MultiDeviceHostStorage>) {
-                                TT_THROW("MultiDeviceHostStorage doesn't support buffer_address method");
-                            } else {
-                                raise_unsupported_storage<T>();
-                            }
-                        },
-                        self.get_storage());
-                },
-                R"doc(
-                Get the address of the underlying buffer.
-
-                The tensor must be on the single device when calling this function.
-
-                .. code-block:: python
-
-                    address = tt_tensor.buffer_address()
-
-            )doc")
+        .def(
+            "buffer_address",
+            [](const Tensor &self) -> uint32_t {
+                return std::visit(
+                    [](auto &&storage) -> uint32_t {
+                        using T = std::decay_t<decltype(storage)>;
+                        if constexpr (std::is_same_v<T, OwnedStorage>) {
+                            TT_THROW("OwnedStorage doesn't support buffer_address method");
+                        } else if constexpr (std::is_same_v<T, DeviceStorage>) {
+                            return storage.buffer->address();
+                        } else if constexpr (std::is_same_v<T, BorrowedStorage>) {
+                            TT_THROW("BorrowedStorage doesn't support buffer_address method");
+                        } else if constexpr (std::is_same_v<T, MultiDeviceStorage>) {
+                            TT_THROW("MultiDeviceStorage doesn't support buffer_address method");
+                        } else if constexpr (std::is_same_v<T, MultiDeviceHostStorage>) {
+                            TT_THROW("MultiDeviceHostStorage doesn't support buffer_address method");
+                        } else {
+                            raise_unsupported_storage<T>();
+                        }
+                    },
+                    self.get_storage());
+            },
+            R"doc(
+            Get the address of the underlying buffer.
+
+            The tensor must be on a single device when calling this function.
+
+            .. code-block:: python
+
+                address = tt_tensor.buffer_address()
+
+        )doc")
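
The two accessors above are deliberately storage-specific: ``buffer()`` is valid only for host (owned/borrowed) storage, while ``buffer_address()`` is valid only for single-device storage; the other branches of each visitor throw. A sketch of the intended call sites (``dev_tensor`` is an illustrative name for a tensor already placed on a device):

.. code-block:: python

    host_buffer = tt_tensor.cpu().buffer()     # host path: move to host, then inspect the data
    device_addr = dev_tensor.buffer_address()  # device path: query the device buffer's address
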
-            .def(
-                "get_layout", [](const Tensor &self) { return self.get_layout(); }, R"doc(
-                Get memory layout of TT Tensor.
-
-                .. code-block:: python
-
-                    layout = tt_tensor.get_layout()
-
-            )doc")
+        .def(
+            "get_layout", [](const Tensor &self) { return self.get_layout(); }, R"doc(
+            Get memory layout of TT Tensor.
+
+            .. code-block:: python
+
+                layout = tt_tensor.get_layout()
+
+        )doc")
-            .def(
-                "memory_config", [](const Tensor &self) { return self.memory_config(); }, R"doc(
-                Get buffer type of TT Tensor.
-
-                .. code-block:: python
-
-                    memory_config = tt_tensor.memory_config()
-
-            )doc")
+        .def(
+            "memory_config", [](const Tensor &self) { return self.memory_config(); }, R"doc(
+            Get memory configuration of TT Tensor.
+
+            .. code-block:: python
+
+                memory_config = tt_tensor.memory_config()
+
+        )doc")
-            .def(
-                "is_allocated", [](const Tensor &self) { return self.is_allocated(); }, R"doc(
-                Check if TT Tensor is allocated.
-
-                .. code-block:: python
-
-                    is_sharded = tt_tensor.is_sharded()
-
-            )doc")
+        .def(
+            "is_allocated", [](const Tensor &self) { return self.is_allocated(); }, R"doc(
+            Check if TT Tensor is allocated.
+
+            .. code-block:: python
+
+                is_allocated = tt_tensor.is_allocated()
+
+        )doc")
-            .def("is_contiguous", [](const Tensor &self) -> bool { return self.is_contiguous(); })
+        .def("is_contiguous", [](const Tensor &self) -> bool { return self.is_contiguous(); })
-            .def(
-                "is_sharded", [](const Tensor &self) { return self.is_sharded(); }, R"doc(
-                Check if TT Tensor is sharded.
-
-                .. code-block:: python
-
-                    is_sharded = tt_tensor.is_sharded()
-
-            )doc")
+        .def(
+            "is_sharded", [](const Tensor &self) { return self.is_sharded(); }, R"doc(
+            Check if TT Tensor is sharded.
+
+            .. code-block:: python
+
+                is_sharded = tt_tensor.is_sharded()
+
+        )doc")
-            .def(
-                "get_dtype", [](const Tensor &self) { return self.get_dtype(); }, R"doc(
-                Get dtype of TT Tensor.
-
-                .. code-block:: python
-
-                    dtype = tt_tensor.get_dtype()
-            )doc")
+        .def(
+            "get_dtype", [](const Tensor &self) { return self.get_dtype(); }, R"doc(
+            Get dtype of TT Tensor.
+
+            .. code-block:: python
+
+                dtype = tt_tensor.get_dtype()
+        )doc")
-            .def(
-                "shape_without_padding",
-                [](const Tensor &self) { return Shape{self.get_legacy_shape().without_padding()}; },
-                R"doc(
-                Get shape without padding of TT Tensor.
-
-                .. code-block:: python
-
-                    dtype = tt_tensor.shape_without_padding()
-            )doc")
+        .def(
+            "shape_without_padding",
+            [](const Tensor &self) { return Shape{self.get_legacy_shape().without_padding()}; },
+            R"doc(
+            Get shape without padding of TT Tensor.
+
+            .. code-block:: python
+
+                shape = tt_tensor.shape_without_padding()
+        )doc")
-            .def(
-                "reshape",
-                [](Tensor &self, int N, int C, int H, int W) { return self.reshape(N, C, H, W); },
-                R"doc(
-                Reshapes TT tensor
-
-                .. code-block:: python
-
-                    reshaped_tensor = tt_tensor.reshape(N, C, H, W)
-            )doc")
+        .def(
+            "reshape",
+            [](Tensor &self, int N, int C, int H, int W) { return self.reshape(N, C, H, W); },
+            R"doc(
+            Reshapes TT tensor.
+
+            .. code-block:: python
+
+                reshaped_tensor = tt_tensor.reshape(N, C, H, W)
+        )doc")
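
``reshape`` preserves the element count; only the view of the dims changes. A small sketch of the invariant, using the ``volume()`` binding documented above (the shapes are illustrative):

.. code-block:: python

    # 1*1*32*32 == 1*1*4*256, so this reshape is legal
    reshaped = tt_tensor.reshape(1, 1, 4, 256)
    assert reshaped.volume() == tt_tensor.volume()
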
-            .def(
-                "reshape",
-                [](Tensor &self, const tt::tt_metal::Shape &shape) -> Tensor { return self.reshape(shape); },
-                R"doc(
-                Reshapes TT tensor
-
-                .. code-block:: python
-
-                    reshaped_tensor = tt_tensor.reshape((4, 3, 32))
-            )doc")
+        .def(
+            "reshape",
+            [](Tensor &self, const tt::tt_metal::Shape &shape) -> Tensor { return self.reshape(shape); },
+            R"doc(
+            Reshapes TT tensor.
+
+            .. code-block:: python
+
+                reshaped_tensor = tt_tensor.reshape((4, 3, 32))
+        )doc")
-            .def_property(
-                "tensor_id",
-                [](const Tensor &self) { return self.tensor_id; },
-                [](Tensor &self, std::size_t tensor_id) { self.tensor_id = tensor_id; });
-    }
+        .def_property(
+            "tensor_id",
+            [](const Tensor &self) { return self.tensor_id; },
+            [](Tensor &self, std::size_t tensor_id) { self.tensor_id = tensor_id; });
+}
 
 }  // namespace ttnn::tensor
diff --git a/ttnn/cpp/pybind11/tensor.cpp b/ttnn/cpp/pybind11/tensor.cpp
index c9281f27172..9a0bb65322a 100644
--- a/ttnn/cpp/pybind11/tensor.cpp
+++ b/ttnn/cpp/pybind11/tensor.cpp
@@ -64,7 +64,7 @@ void implement_buffer_protocol(py::module& m_tensor, std::string_view name) {
 }
 
 }  // namespace detail
 
-void py_module_types(py::module& m_tensor) {
+void tensor_mem_config_module_types(py::module& m_tensor) {
     export_enum<Layout>(m_tensor);
     export_enum<DataType>(m_tensor);
     export_enum<StorageType>(m_tensor);
@@ -115,7 +115,7 @@ void py_module_types(py::module& m_tensor) {
 
 }
 
-void tensor_module(py::module& m_tensor) {
+void tensor_mem_config_module(py::module& m_tensor) {
     using tt::tt_metal::Shape;
 
     auto py_core_coord = static_cast<py::class_<CoreCoord>>(m_tensor.attr("CoreCoord"));
@@ -291,8 +291,4 @@ void tensor_module(py::module& m_tensor) {
 
 }
 
-void py_module(py::module& module) {
-    tensor_module(module);
-}
-
 }  // namespace ttnn::tensor
diff --git a/ttnn/cpp/pybind11/tensor.hpp b/ttnn/cpp/pybind11/tensor.hpp
index 3bc367c0c1b..2d26484ac60 100644
--- a/ttnn/cpp/pybind11/tensor.hpp
+++ b/ttnn/cpp/pybind11/tensor.hpp
@@ -10,9 +10,9 @@ namespace py = pybind11;
 
 namespace ttnn::tensor {
 
-    void pytensor_module_types(pybind11::module & m_tensor);
-    void pytensor_module(pybind11::module & m_tensor);
-    void py_module_types(py::module& module);
-    void py_module(py::module& module);
+void pytensor_module_types(py::module & m_tensor);
+void pytensor_module(py::module & m_tensor);
+void tensor_mem_config_module_types(py::module& module);
+void tensor_mem_config_module(py::module& module);
 
 }  // namespace ttnn::tensor
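
The renames above are internal to the C++ registration helpers; the Python-facing surface is unchanged, since the same types and enums are registered on the same submodules. A quick smoke test, assuming a built ``ttnn`` package is importable (the exact re-exports may vary by version):

.. code-block:: python

    import ttnn

    # Symbols registered by the renamed helpers should still resolve as before.
    assert hasattr(ttnn, "Tensor")
    assert hasattr(ttnn, "Layout")
    assert hasattr(ttnn, "DataType")
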