
#8835: Introduce a faster more flexible operation infra v2 #9279

Merged: 5 commits, Jun 18, 2024
16 changes: 8 additions & 8 deletions tests/ttnn/unit_tests/operations/test_relational.py
@@ -236,16 +236,16 @@ def test_expand_and_broadcast(device, h, w):
@pytest.mark.parametrize("h", [500])
@pytest.mark.parametrize("w", [512])
def test_expand_and_broadcast_reversed(device, h, w):
-    torch_a = torch.rand((1, h, w), dtype=torch.bfloat16)
-    torch_b = torch.rand((h, w), dtype=torch.bfloat16)
-    torch_output = torch.lt(torch_b, torch_a)
+    torch_input_tensor_a = torch.rand((1, h, w), dtype=torch.bfloat16)
+    torch_input_tensor_b = torch.rand((h, w), dtype=torch.bfloat16)
+    torch_output = torch.lt(torch_input_tensor_b, torch_input_tensor_a)

-    a = ttnn.from_torch(torch_a, layout=ttnn.TILE_LAYOUT, device=device)
-    b = ttnn.from_torch(torch_b, layout=ttnn.TILE_LAYOUT, device=device)
-    tt_output = ttnn.lt(b, a)
-    tt_output = ttnn.to_torch(tt_output)
+    input_tensor_a = ttnn.from_torch(torch_input_tensor_a, layout=ttnn.TILE_LAYOUT, device=device)
+    input_tensor_b = ttnn.from_torch(torch_input_tensor_b, layout=ttnn.TILE_LAYOUT, device=device)
+    output = ttnn.lt(input_tensor_b, input_tensor_a)
+    output = ttnn.to_torch(output)

-    assert_with_pcc(torch_output, tt_output, 0.9999)
+    assert_with_pcc(torch_output, output, 0.9999)


@pytest.mark.parametrize("atol", [1e-8, 1e-10])
12 changes: 11 additions & 1 deletion tt_eager/tensor/tensor.hpp
@@ -43,7 +43,8 @@ struct Tensor {
bool track_ref_count = false;
TensorAttributes(const Storage storage, const ttnn::Shape shape, DataType dtype, Layout layout) :
storage(storage), shape(shape), dtype(dtype), layout(layout) {}
-        TensorAttributes() : shape({0xff, 0xff, 0xff, 0xff}), dtype(DataType::INVALID), layout(Layout::INVALID) {}
+        TensorAttributes() :
+            shape(std::array<uint32_t, 4>{0xff, 0xff, 0xff, 0xff}), dtype(DataType::INVALID), layout(Layout::INVALID) {}
~TensorAttributes() = default;

// Use these functions to manage the main_thread_ref_count for a tensor attr instance.
@@ -392,6 +393,15 @@ Tensor create_device_tensor(
Device *device,
const MemoryConfig &memory_config = {.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED});

+static Tensor create_device_tensor(
+    const ttnn::Shape &shape,
+    DataType dtype,
+    Layout layout,
+    Device *device,
+    const MemoryConfig &memory_config = {.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}) {
+    return create_device_tensor(shape.value(), dtype, layout, device, memory_config);
+}

// template<typename Buffer>
// void *get_host_buffer(const Tensor &tensor);
void *get_raw_host_data_ptr(const Tensor &tensor);
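For context, a hypothetical call site for the new overload (fragment only; the enum values and the Device pointer are illustrative, not taken from this PR): callers holding a ttnn::Shape no longer unwrap it with .value() themselves.

    // Illustrative fragment, not part of the diff: assumes a valid Device*
    // and that the caller wants a tile-aligned 4D shape.
    auto tensor = create_device_tensor(
        ttnn::Shape(std::array<uint32_t, 4>{1, 1, 32, 32}),
        DataType::BFLOAT16,
        Layout::TILE,
        device);  // memory_config defaults to interleaved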
22 changes: 15 additions & 7 deletions tt_eager/tensor/types.hpp
@@ -173,19 +173,21 @@ class Shape {
}

template <std::size_t Rank>
-    explicit Shape(const std::array<uint32_t, Rank> &shape, const std::array<uint32_t, Rank> &shape_tile_padding) :
+    explicit Shape(const std::array<uint32_t, Rank> &shape, const std::array<uint32_t, Rank> &shape_with_tile_padding) :
rank_(Rank), dimensions_{}, padding_{Rank} {
for (auto index = 0; index < Rank; index++) {
-            auto padded_dimension = shape_tile_padding[index];
+            auto padded_dimension = shape_with_tile_padding[index];
this->dimensions_[index] = padded_dimension;
this->padding_[index] = {.front = 0, .back = padded_dimension - shape[index]};
}
}
-    explicit Shape(const std::vector<uint32_t> &shape, const std::vector<uint32_t> &shape_tile_padding) :
+    explicit Shape(const std::vector<uint32_t> &shape, const std::vector<uint32_t> &shape_with_tile_padding) :
rank_(shape.size()), dimensions_{}, padding_{shape.size()} {
-        TT_ASSERT(shape.size() == shape_tile_padding.size(), "Shape and shape_tile_padding must have the same size");
+        TT_ASSERT(
+            shape.size() == shape_with_tile_padding.size(),
+            "Shape and shape_with_tile_padding must have the same size");
for (auto index = 0; index < shape.size(); index++) {
-            auto padded_dimension = shape_tile_padding[index];
+            auto padded_dimension = shape_with_tile_padding[index];
this->dimensions_[index] = padded_dimension;
this->padding_[index] = {.front = 0, .back = padded_dimension - shape[index]};
}
@@ -720,14 +722,20 @@ struct Shape {
explicit Shape(const std::array<uint32_t, Rank> &shape) : ranked_shape{RankedShape<Rank>{shape}} {}

template <std::size_t Rank>
-    explicit Shape(const std::array<uint32_t, Rank> &shape, const std::array<uint32_t, Rank> &shape_tile_padding) :
-        ranked_shape{RankedShape<Rank>{shape, shape_tile_padding}} {}
+    explicit Shape(const std::array<uint32_t, Rank> &shape, const std::array<uint32_t, Rank> &shape_with_tile_padding) :
+        ranked_shape{RankedShape<Rank>{shape, shape_with_tile_padding}} {}

template <std::size_t Rank>
explicit Shape(
const std::array<uint32_t, Rank> &shape, const std::array<std::array<uint32_t, 2>, Rank> &tile_padding) :
ranked_shape{RankedShape<Rank>{shape, tile_padding}} {}

+    static Shape from_vector(const std::vector<uint32_t> &shape) { return Shape{tt::tt_metal::Shape{shape}}; }
+
+    static Shape from_vector(const std::vector<uint32_t> &shape, const std::vector<uint32_t> &shape_with_tile_padding) {
+        return Shape{tt::tt_metal::Shape{shape, shape_with_tile_padding}};
+    }

const auto rank() const {
return std::visit(
[]<std::size_t Rank>(const RankedShape<Rank> &shape) -> const auto { return Rank; }, this->ranked_shape);
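The two from_vector overloads mirror the array-based constructors: the one-argument form builds an unpadded shape, and the two-argument form records per-dimension back-padding as padded_dimension minus logical_dimension. An illustrative fragment (values assumed, not from this PR):

    // Logical 1x1x30x30 carried inside a tile-aligned 1x1x32x32 padded shape;
    // the back padding for the last two dims becomes 2.
    auto padded = ttnn::Shape::from_vector({1, 1, 30, 30}, {1, 1, 32, 32});
    auto unpadded = ttnn::Shape::from_vector({1, 1, 30, 30});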
8 changes: 6 additions & 2 deletions tt_eager/tt_dnn/op_library/run_operation.cpp
@@ -17,6 +17,10 @@
#include "tt_metal/tt_stl/reflection.hpp"
#include "tt_numpy/functions.hpp"

+namespace tt::tt_metal {
+std::atomic<uint32_t> operation_id_atomic_count = 0;
+}

namespace tt::tt_metal::operation {

namespace detail {
@@ -119,7 +123,7 @@ constexpr auto decorate_device_operation(const Function& function) {
template <typename OutputTensors>
OutputTensors run_host_operation(const HostOperation<OutputTensors>& operation, const Tensors& input_tensors) {
ZoneScopedN("TT_DNN_HOST_OP");
-    uint32_t op_id = assign_id();
+    uint32_t op_id = assign_operation_id();

operation.validate(input_tensors);
auto output_tensors = operation.compute_output_tensors(input_tensors);
@@ -143,7 +147,7 @@ OutputTensors run_device_operation(
const OptionalConstTensors& optional_input_tensors,
const OptionalTensors& optional_output_tensors) {
ZoneScopedN("TT_DNN_DEVICE_OP");
-    uint32_t op_id = assign_id();
+    uint32_t op_id = assign_operation_id();

std::function<std::variant<std::shared_ptr<Program>, std::reference_wrapper<Program>>(
const DeviceOperation<OutputTensors>&,
6 changes: 0 additions & 6 deletions tt_eager/tt_dnn/op_library/run_operation.hpp
@@ -276,12 +276,6 @@ inline void log_operation(
const OptionalTensors& optional_output_tensors = {}) {}
#endif

-inline uint32_t assign_id()
-{
-    static std::atomic<uint32_t> atomic_count{0};
-    return atomic_count.fetch_add(1);
-}
-
template<class OutputTensors=Tensors>
OutputTensors run(
const HostOperation<OutputTensors>& operation,
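Taken together with the run_operation.cpp and op_profiler.hpp hunks, this deletion replaces a header-local static counter with one process-wide atomic: declared extern (plus an inline accessor) in the profiler header and defined once in run_operation.cpp, so host ops, device ops, and Python fallbacks all draw ids from the same pool. A condensed sketch of the pattern (single-file toy; the real code splits it across the two files named above):

    #include <atomic>
    #include <cstdint>

    // op_profiler.hpp: declaration plus inline accessor, usable from any TU
    namespace tt::tt_metal {
    extern std::atomic<uint32_t> operation_id_atomic_count;
    inline uint32_t assign_operation_id() { return operation_id_atomic_count.fetch_add(1); }
    }  // namespace tt::tt_metal

    // run_operation.cpp: the single definition
    namespace tt::tt_metal {
    std::atomic<uint32_t> operation_id_atomic_count = 0;
    }  // namespace tt::tt_metal

    int main() {
        // ids increase monotonically across every call site in the process
        return static_cast<int>(tt::tt_metal::assign_operation_id());  // returns 0
    }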
2 changes: 1 addition & 1 deletion tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_pytensor.cpp
@@ -567,7 +567,7 @@ Tensor convert_python_tensors_to_tt_tensors(py::list tensor_shards, std::optiona
ZoneScopedN("TT_DNN_FALLBACK_OP");
auto [op, input_tensors] = detail::parse_external_operation(function, args, kwargs, function_name);
operation::log_operation(op, input_tensors);
-    uint32_t op_id = tt::tt_metal::operation::assign_id();
+    uint32_t op_id = tt::tt_metal::assign_operation_id();

auto output_tensors = function(*args, **kwargs);

5 changes: 4 additions & 1 deletion tt_metal/impl/device/program_cache.hpp
@@ -51,7 +51,10 @@ struct ProgramCache {

private:
inline static bool is_enabled_ = false;
-    std::unordered_map<uint64_t, tt::stl::unique_any<1024, 32>> cache_{};
+
+    static constexpr auto MAX_CACHED_PROGRAM_SIZE = 1024;
+    static constexpr auto ALIGNMENT = 32;
+    std::unordered_map<uint64_t, tt::stl::unique_any<MAX_CACHED_PROGRAM_SIZE, ALIGNMENT>> cache_{};
};

}
97 changes: 97 additions & 0 deletions tt_metal/tools/profiler/op_profiler.hpp
@@ -5,6 +5,7 @@
#pragma once

#include <filesystem>
+#include <tuple>
#include <type_traits>

#include "tensor/tensor.hpp"
@@ -22,6 +23,12 @@ namespace tt {

namespace tt_metal {

+extern std::atomic<uint32_t> operation_id_atomic_count;
+
+inline uint32_t assign_operation_id() {
+    return operation_id_atomic_count.fetch_add(1);
+}

namespace op_profiler {

enum class OpType { python_fallback, tt_dnn_cpu, tt_dnn_device, unknown };
@@ -251,6 +258,48 @@ inline json get_base_json(
return j;
}

+template <typename operation_t>
+inline json get_base_json(
+    uint32_t operation_id,
+    const typename operation_t::operation_attributes_t& operation_attributes,
+    const typename operation_t::tensor_args_t& tensor_args,
+    typename operation_t::tensor_return_value_t& tensor_return_value) {
+    ZoneScoped;
+    json j;
+    j["global_call_count"] = operation_id;
+
+    auto as_string = [](std::string_view v) -> std::string { return {v.data(), v.size()}; };
+    std::string opName = as_string(tt::stl::get_type_name<operation_t>());
+    std::replace(opName.begin(), opName.end(), ',', ';');
+    j["op_code"] = opName;
+
+    json attributesObj;
+    constexpr auto& attribute_names = std::decay_t<decltype(operation_attributes)>::attribute_names;
+    const auto attribute_values = operation_attributes.attribute_values();
+    [&attributesObj, &attribute_names, &attribute_values]<size_t... Ns>(std::index_sequence<Ns...>) {
+        (
+            [&attributesObj, &attribute_names, &attribute_values] {
+                const auto& attribute_name = std::get<Ns>(attribute_names);
+                const auto& attribute = std::get<Ns>(attribute_values);
+                attributesObj[attribute_name] = fmt::format("{}", attribute);
+            }(),
+            ...);
+    }(std::make_index_sequence<std::tuple_size_v<std::decay_t<decltype(attribute_names)>>>{});
+    j["attributes"] = attributesObj;
+
+    std::vector<json> input_tensors;
+    tt::stl::reflection::visit_object_of_type<Tensor>(
+        [&input_tensors](auto&& tensor) { input_tensors.push_back(get_tensor_json(tensor)); }, tensor_args);
+    j["input_tensors"] = input_tensors;
+
+    std::vector<json> output_tensors;
+    tt::stl::reflection::visit_object_of_type<Tensor>(
+        [&output_tensors](auto&& tensor) { output_tensors.push_back(get_tensor_json(tensor)); }, tensor_return_value);
+    j["output_tensors"] = output_tensors;
+
+    return j;
+}
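The attribute loop above leans on a C++20 idiom worth unpacking: an immediately-invoked template lambda expands an index sequence so a fold expression can walk two parallel tuples at compile time. A standalone toy showing just the idiom (not the PR's code):

    #include <cstddef>
    #include <iostream>
    #include <tuple>
    #include <utility>

    int main() {
        constexpr auto names = std::make_tuple("alpha", "beta");
        const auto values = std::make_tuple(1, 2.5);
        // one fold iteration per tuple index, all indices resolved at compile time
        [&]<std::size_t... Ns>(std::index_sequence<Ns...>) {
            ((std::cout << std::get<Ns>(names) << " = " << std::get<Ns>(values) << '\n'), ...);
        }(std::make_index_sequence<std::tuple_size_v<decltype(values)>>{});
        // prints: alpha = 1
        //         beta = 2.5
    }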

inline std::string op_meta_data_serialized_json(
uint32_t opID, const tt::tt_metal::operation::ExternalOperation& op, const std::vector<Tensor>& input_tensors) {
auto j = get_base_json<true>(opID, op, input_tensors);
@@ -322,6 +371,37 @@ inline std::string op_meta_data_serialized_json(
}
}

Review comment (Member): "type erasure would be really nice here to make this method non-templated" (a hedged sketch of the idea follows this function)

+template <typename operation_t>
+inline std::string op_meta_data_serialized_json(
+    const operation_t& operation,
+    uint32_t operation_id,
+    auto device_id,
+    const auto& program,
+    const auto& program_hash,
+    const auto& operation_attributes,
+    const auto& tensor_args,
+    auto& tensor_return_value) {
+    auto j = get_base_json<operation_t>(operation_id, operation_attributes, tensor_args, tensor_return_value);
+    j["op_type"] = magic_enum::enum_name(OpType::tt_dnn_device);
+    j["device_id"] = device_id;
+    j["op_hash"] = program_hash;
+    j["kernel_info"] = get_kernels_json(program);
+
+    j["optional_input_tensors"] = std::vector<json>{};
+
+    auto perfModel = operation_t::create_op_performance_model(operation_attributes, tensor_args, tensor_return_value);
+    j["performance_model"]["compute_ns"] = perfModel.get_compute_ns();
+    j["performance_model"]["ideal_ns"] = perfModel.get_ideal_ns();
+    j["performance_model"]["bandwidth_ns"] = perfModel.get_bandwidth_ns();
+    j["performance_model"]["input_bws"] = perfModel.get_input_bws();
+    j["performance_model"]["output_bws"] = perfModel.get_output_bws();
+
+    std::string short_str = fmt::format("`TT_DNN_DEVICE_OP: {}, {}, {}, ", j["op_code"], program_hash, device_id);
+
+    std::string ser = j.dump(4);
+    return fmt::format("{}{} ->\n{}`", short_str, operation_id, ser);
+}
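A hedged sketch of the reviewer's type-erasure suggestion (standalone toy; every name here is hypothetical, not proposed API): each templated entry point would only capture small closures, while the bulky serialization logic compiles once in a non-templated core.

    #include <cstdint>
    #include <functional>
    #include <iostream>
    #include <string>

    struct erased_op_t {
        std::string op_code;                      // type name, already stringified
        std::function<std::string()> attributes;  // renders attributes on demand
    };

    // Non-templated core: one instantiation no matter how many op types exist.
    std::string serialize(const erased_op_t& op, uint32_t operation_id) {
        return "id=" + std::to_string(operation_id) + " op=" + op.op_code + " attrs=" + op.attributes();
    }

    // Thin templated shim: the only code that grows with the number of ops.
    template <typename operation_t>
    erased_op_t erase(const typename operation_t::operation_attributes_t& attrs) {
        return {operation_t::name, [attrs] { return attrs.to_string(); }};
    }

    // Toy operation to exercise the shim.
    struct example_op_t {
        static constexpr auto name = "example_op";
        struct operation_attributes_t {
            int dim;
            std::string to_string() const { return "dim=" + std::to_string(dim); }
        };
    };

    int main() {
        example_op_t::operation_attributes_t attrs{.dim = 3};
        std::cout << serialize(erase<example_op_t>(attrs), 42) << '\n';  // id=42 op=example_op attrs=dim=3
    }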

#define TracyOpTTNNDevice( \
op_id, op_hash, is_cached, device_id, operation, program, input_tensors, optional_input_tensors, output_tensors) \
std::string op_message = op_profiler::op_meta_data_serialized_json( \
Expand All @@ -338,6 +418,21 @@ inline std::string op_meta_data_serialized_json(
ZoneText(op_text.c_str(), op_text.size()); \
TracyMessage(op_message.c_str(), op_message.size());

Review comment (Member): "is it possible to mark TracyOpTTNNDevice as deprecated somehow?" (a hedged sketch follows this macro)

+#define TracyOpTNNNDeviceV2( \
+    operation, operation_id, device_id, program, program_hash, operation_attributes, tensor_args, tensor_return_value) \
+    std::string op_message = op_profiler::op_meta_data_serialized_json( \
+        operation, \
+        operation_id, \
+        device_id, \
+        program, \
+        program_hash, \
+        operation_attributes, \
+        tensor_args, \
+        tensor_return_value); \
+    std::string op_text = fmt::format("id:{}", operation_id); \
+    ZoneText(op_text.c_str(), op_text.size()); \
+    TracyMessage(op_message.c_str(), op_message.size());
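On the deprecation question: a macro cannot carry [[deprecated]] directly, but it can expand to a call to a deprecated no-op function so every remaining use site emits a compiler warning. A hedged sketch (names hypothetical, original expansion elided):

    [[deprecated("use TracyOpTNNNDeviceV2 instead")]]
    inline void tracy_op_ttnn_device_is_deprecated() {}

    #define TracyOpTTNNDevice_impl(...) /* original expansion would go here */
    #define TracyOpTTNNDevice(...)            \
        tracy_op_ttnn_device_is_deprecated(); \
        TracyOpTTNNDevice_impl(__VA_ARGS__)

Each expansion of TracyOpTTNNDevice then triggers a -Wdeprecated-declarations warning pointing at the call site, which is about as close as the preprocessor gets to [[deprecated]].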

#define TracyOpTTNNHost(op_id, operation, input_tensors, output_tensors) \
std::string op_message = \
op_profiler::op_meta_data_serialized_json(op_id, operation, input_tensors, output_tensors); \
Expand All @@ -355,6 +450,8 @@ inline std::string op_meta_data_serialized_json(

#define TracyOpTTNNDevice( \
op_id, op_hash, is_cached, device_id, operation, program, input_tensors, optional_input_tensors, output_tensors)
+#define TracyOpTNNNDeviceV2( \
+    operation, operation_id, device_id, program, program_hash, operation_attributes, tensor_args, tensor_return_value)
#define TracyOpTTNNHost(op_id, operation, input_tensors, output_tensors)
#define TracyOpTTNNExternal(op_id, op, input_tensors)

42 changes: 42 additions & 0 deletions tt_metal/tt_stl/reflection.hpp
@@ -387,6 +387,48 @@ std::ostream& operator<<(std::ostream& os, const std::set<T>& set) {
return os;
}

+template <typename to_visit_t, typename T>
+    requires std::same_as<std::decay_t<T>, to_visit_t>
+constexpr auto visit_object_of_type(auto callback, T&& value) {
+    callback(value);
+}
+
+template <typename to_visit_t, typename T>
+constexpr auto visit_object_of_type(auto callback, const std::optional<T>& value) {
+    if (value.has_value()) {
+        visit_object_of_type<to_visit_t>(callback, value.value());
+    }
+}
+
+template <typename to_visit_t, typename T>
+constexpr auto visit_object_of_type(auto callback, const std::vector<T>& value) {
+    for (auto& tensor : value) {
+        visit_object_of_type<to_visit_t>(callback, tensor);
+    }
+}
+
+template <typename to_visit_t, typename T, auto N>
+constexpr auto visit_object_of_type(auto callback, const std::array<T, N>& value) {
+    for (auto& tensor : value) {
+        visit_object_of_type<to_visit_t>(callback, tensor);
+    }
+}
+
+template <typename to_visit_t, typename... Ts>
+constexpr auto visit_object_of_type(auto callback, const std::tuple<Ts...>& value) {
+    constexpr auto num_attributes = sizeof...(Ts);
+    [&callback, &value]<size_t... Ns>(std::index_sequence<Ns...>) {
+        (visit_object_of_type<to_visit_t>(callback, std::get<Ns>(value)), ...);
+    }(std::make_index_sequence<num_attributes>{});
+}
+
+template <typename to_visit_t, typename T>
+    requires(not std::same_as<std::decay_t<T>, to_visit_t>) and requires { std::decay_t<T>::attribute_names; }
+constexpr auto visit_object_of_type(auto callback, T&& object) {
+    constexpr auto num_attributes = std::tuple_size_v<decltype(std::decay_t<T>::attribute_names)>;
+    visit_object_of_type<to_visit_t>(callback, object.attribute_values());
+}
+
} // namespace reflection
} // namespace stl
} // namespace tt
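The visitor recurses structurally: optionals unwrap, vectors, arrays, and tuples iterate, any type exposing attribute_names walks its attribute_values(), and the requires-constrained base case fires the callback on exact type matches. A self-contained toy mirroring the same overload set (int stands in for Tensor; not the PR's code):

    #include <concepts>
    #include <iostream>
    #include <optional>
    #include <tuple>
    #include <type_traits>
    #include <vector>

    template <typename to_visit_t, typename T>
        requires std::same_as<std::decay_t<T>, to_visit_t>
    void visit(auto callback, T&& value) { callback(value); }

    template <typename to_visit_t, typename T>
    void visit(auto callback, const std::optional<T>& value) {
        if (value.has_value()) { visit<to_visit_t>(callback, value.value()); }
    }

    template <typename to_visit_t, typename T>
    void visit(auto callback, const std::vector<T>& value) {
        for (const auto& element : value) { visit<to_visit_t>(callback, element); }
    }

    template <typename to_visit_t, typename... Ts>
    void visit(auto callback, const std::tuple<Ts...>& value) {
        std::apply([&](const auto&... elements) { (visit<to_visit_t>(callback, elements), ...); }, value);
    }

    int main() {
        // flattens the nested structure and visits every int: prints 1 2 3 4
        visit<int>([](int v) { std::cout << v << ' '; },
                   std::tuple{1, std::optional<int>{2}, std::vector<int>{3, 4}});
    }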
4 changes: 4 additions & 0 deletions ttnn/CMakeLists.txt
@@ -6,6 +6,10 @@ set(TTNN_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/matmul.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/reduction/argmax/device/argmax_op.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/eltwise/binary/device/binary_op.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_and_width_multi_core_program_factory.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_program_factory.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/eltwise/binary/device/broadcast_width_multi_core_program_factory.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_program_factory.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/reduction/topk/device/topk_op.cpp
)

4 changes: 2 additions & 2 deletions ttnn/cpp/ttnn/decorators.hpp
@@ -219,7 +219,7 @@ struct operation_t {

template <typename... args_t>
auto operator()(args_t&&... args) const {
-        ZoneScoped;
+        ZoneScopedN("Run ttnn operation (struct-based)");
ZoneName(this->cpp_fully_qualified_name, std::strlen(this->cpp_fully_qualified_name));
tt::log_debug(tt::LogOp, "Started C++ ttnn operation: {}", this->cpp_fully_qualified_name);

@@ -332,7 +332,7 @@ struct lambda_operation_t {

template <typename... args_t>
auto operator()(args_t&&... args) const {
-        ZoneScoped;
+        ZoneScopedN("Run ttnn operation (lambda-based)");
ZoneName(this->cpp_fully_qualified_name, std::strlen(this->cpp_fully_qualified_name));
tt::log_debug(tt::LogOp, "Started C++ ttnn operation: {}", this->cpp_fully_qualified_name);
auto output = this->lambda(std::forward<decltype(args)>(args)...);