From bf1d922039a0991b558ff0b6215f6c5e2352551f Mon Sep 17 00:00:00 2001 From: Milan Topalovic <163355844+mtopalovicTT@users.noreply.github.com> Date: Fri, 30 Aug 2024 16:46:12 +0200 Subject: [PATCH 01/16] Adding `unsqueeze` op (#544) Adding `unsqueeze` op --- include/ttmlir/Dialect/TTIR/IR/TTIROps.td | 20 ++++++++ lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp | 47 ++++++++++++++++++- lib/Dialect/TTIR/IR/TTIROps.cpp | 45 ++++++++++++++++++ .../ttmlir/Dialect/TTNN/simple_unsqueeze.mlir | 10 ++++ 4 files changed, 121 insertions(+), 1 deletion(-) create mode 100644 test/ttmlir/Dialect/TTNN/simple_unsqueeze.mlir diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td index 5479dcb2f7..fe85f9bb76 100644 --- a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td +++ b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td @@ -428,6 +428,26 @@ def TTIR_SqueezeOp : TTIR_DPSOp<"squeeze"> { let hasVerifier = 1; } +def TTIR_UnsqueezeOp : TTIR_DPSOp<"unsqueeze"> { + let summary = "Unsqueeze op."; + let description = [{ + Unsqueeze tensor. + }]; + + let arguments = (ins AnyRankedTensor:$input, + AnyRankedTensor:$output, + SI32Attr:$dim, + TT_OperandConstraintArrayAttr:$operand_constraints); + + let results = (outs AnyRankedTensor:$result); + + let extraClassDeclaration = [{ + MutableOperandRange getDpsInitsMutable() { return getOutputMutable(); } + }]; + + let hasVerifier = 1; +} + // ANCHOR: adding_an_op_matmul_ttir def TTIR_MatmulOp : TTIR_DPSOp<"matmul"> { let summary = "Matrix multiply operation."; diff --git a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp index 1818898408..e8cb8cd287 100644 --- a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp +++ b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp @@ -193,7 +193,7 @@ class SqueezeOpConversionPattern : public OpConversionPattern { LogicalResult matchAndRewrite(ttir::SqueezeOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - // Extract input tensor types + // Extract input tensor type ::mlir::RankedTensorType inputType = mlir::cast<::mlir::RankedTensorType>(adaptor.getInput().getType()); @@ -224,6 +224,50 @@ class SqueezeOpConversionPattern : public OpConversionPattern { } }; +class UnsqueezeOpConversionPattern + : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(ttir::UnsqueezeOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + // Extract input tensor type + ::mlir::RankedTensorType inputType = + mlir::cast<::mlir::RankedTensorType>(adaptor.getInput().getType()); + + // Get the unsqueeze dimension + int32_t dim = adaptor.getDim(); + + // Convert negative dim to its positive equivalent + if (dim < 0) { + dim += inputType.getRank() + 1; + } + + // Get the shape of the input tensor + auto inputShape = inputType.getShape(); + llvm::SmallVector newShape; + + // Insert the new dimension + for (int i = 0; i < inputType.getRank(); ++i) { + if (i == dim) { + newShape.push_back(1); + } + newShape.push_back(inputShape[i]); + } + + // Create the new shape attribute + auto shapeAttr = rewriter.getI32ArrayAttr(newShape); + + // Replace the UnsqueezeOp with a ReshapeOp + rewriter.replaceOpWithNewOp( + op, this->getTypeConverter()->convertType(op.getType()), + adaptor.getInput(), adaptor.getOutput(), shapeAttr); + + return success(); + } +}; + } // namespace // ANCHOR: adding_an_op_matmul_op_rewriter @@ -269,6 +313,7 @@ void populateTTIRToTTNNPatterns(MLIRContext *ctx, RewritePatternSet 
&patterns, ConcatOpConversionPattern, ReshapeOpConversionPattern, SqueezeOpConversionPattern, + UnsqueezeOpConversionPattern, MatmulOpConversionPattern >(typeConverter, ctx); // ANCHOR_END: op_rewriter_pattern_set diff --git a/lib/Dialect/TTIR/IR/TTIROps.cpp b/lib/Dialect/TTIR/IR/TTIROps.cpp index 721928aead..85255e497e 100644 --- a/lib/Dialect/TTIR/IR/TTIROps.cpp +++ b/lib/Dialect/TTIR/IR/TTIROps.cpp @@ -305,6 +305,51 @@ ::mlir::LogicalResult mlir::tt::ttir::SqueezeOp::verify() { return success(); } +::mlir::LogicalResult mlir::tt::ttir::UnsqueezeOp::verify() { + ::mlir::RankedTensorType inputType = getInput().getType(); + ::mlir::RankedTensorType outputType = getOutput().getType(); + int32_t dim = getDim(); + + // Convert negative dim to its positive equivalent + if (dim < 0) { + dim += inputType.getRank() + 1; + } + + // Check that the dim is within the bounds of the input tensor + if (dim > inputType.getRank() || dim < 0) { + return emitOpError( + "Dimension attribute must be within the bounds of the input tensor"); + } + + // Check that the output tensor has one more dimension than the input tensor + if (outputType.getRank() != inputType.getRank() + 1) { + return emitOpError( + "Output tensor must have one more dimension than the input tensor"); + } + + // and that the dimension added is of size 1 + if (outputType.getDimSize(dim) != 1) { + return emitOpError("Dimension added must be of size 1"); + } + + // All dimensions of the input tensor must be the same as the output tensor + // except for the dimension added + for (int64_t i = 0, j = 0; i < outputType.getRank(); ++i) { + if (i == dim) { + continue; + } + + if (inputType.getDimSize(j) != outputType.getDimSize(i)) { + return emitOpError("All dimensions of the input tensor must be the same " + "as the output tensor except for the dimension added"); + } + + j++; + } + + return success(); +} + // ANCHOR: adding_an_op_matmul_ttir_verify ::mlir::LogicalResult mlir::tt::ttir::MatmulOp::verify() { ::mlir::RankedTensorType inputAType = getA().getType(); diff --git a/test/ttmlir/Dialect/TTNN/simple_unsqueeze.mlir b/test/ttmlir/Dialect/TTNN/simple_unsqueeze.mlir new file mode 100644 index 0000000000..2400b6b5ed --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/simple_unsqueeze.mlir @@ -0,0 +1,10 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline %s| FileCheck %s +#any_device_tile = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<4x2x32x32xbf16>) -> tensor<4x1x2x32x32xbf16> { + %0 = tensor.empty() : tensor<4x1x2x32x32xbf16> + // CHECK: %[[C:.*]] = "ttnn.reshape"[[C:.*]] + %1 = "ttir.unsqueeze"(%arg0, %0) <{dim = -4 : si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : (tensor<4x2x32x32xbf16>, tensor<4x1x2x32x32xbf16>) -> tensor<4x1x2x32x32xbf16> + return %1 : tensor<4x1x2x32x32xbf16> + } +} From 6595400dc22572c4e852b6e3bb61e265b84f4a0c Mon Sep 17 00:00:00 2001 From: Lewis Panos Date: Fri, 30 Aug 2024 10:54:07 -0400 Subject: [PATCH 02/16] Bringup E2E conv2d op (#484) --- include/ttmlir/Dialect/TTIR/IR/TTIROps.td | 31 ++++++++++ include/ttmlir/Dialect/TTNN/IR/TTNNOps.td | 34 +++++++++++ include/ttmlir/Target/TTNN/program.fbs | 22 +++++++ lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp | 68 +++++++++++++++++++++- lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp | 4 ++ lib/Dialect/TTIR/IR/TTIROps.cpp | 19 ++++++ lib/Dialect/TTIR/Transforms/Passes.cpp | 5 ++ lib/Dialect/TTNN/IR/TTNNOps.cpp | 24 ++++++++ lib/Target/TTNN/TTNNToFlatbuffer.cpp | 25 ++++++++ runtime/include/tt/runtime/detail/ttnn.h | 5 ++ 
runtime/lib/ttnn/program.cpp | 34 +++++++++++ runtime/lib/ttnn/runtime.cpp | 2 +- test/ttmlir/Dialect/TTNN/simple_conv.mlir | 10 ++++ 13 files changed, 281 insertions(+), 2 deletions(-) create mode 100644 test/ttmlir/Dialect/TTNN/simple_conv.mlir diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td index fe85f9bb76..edfee781a3 100644 --- a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td +++ b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td @@ -377,6 +377,37 @@ def TTIR_ConcatOp : TTIR_DPSOp<"concat"> { let arguments = (ins Variadic:$inputs, AnyRankedTensor:$output, SI32Attr:$dim, + + TT_OperandConstraintArrayAttr:$operand_constraints); + + let results = (outs AnyRankedTensor:$result); + + let extraClassDeclaration = [{ + MutableOperandRange getDpsInitsMutable() { return getOutputMutable(); } + }]; + + let hasVerifier = 1; +} + +def TTIR_Conv2dOp : TTIR_DPSOp<"conv2d"> { + let summary = "Conv2d operation."; + let description = [{ + Applies a 2D convolution over an input image composed of several input planes. + }]; + + let arguments = (ins AnyRankedTensor:$input, + AnyRankedTensor:$weight, + Optional:$bias, + AnyRankedTensor:$output, + SI32Attr:$stride_height, + SI32Attr:$stride_width, + SI32Attr:$dilation_height, + SI32Attr:$dilation_width, + SI32Attr:$groups, + SI32Attr:$padding_left, + SI32Attr:$padding_right, + SI32Attr:$padding_top, + SI32Attr:$padding_bottom, TT_OperandConstraintArrayAttr:$operand_constraints); let results = (outs AnyRankedTensor:$result); diff --git a/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td b/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td index 08a6a38701..380d798bbf 100644 --- a/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td +++ b/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td @@ -304,6 +304,40 @@ def TTNN_MatmulOp : TTNN_NamedDPSOp<"matmul"> { } // ANCHOR_END: adding_an_op_matmul_ttnn +def TTNN_Conv2dOp : TTNN_NamedDPSOp<"conv2d"> { + let summary = "Conv2d operation."; + let description = [{ + Applies a 2D convolution over an input image composed of several input planes. 
+ }]; + + let arguments = (ins AnyRankedTensor:$input, + AnyRankedTensor:$weight, + Optional:$bias, + AnyRankedTensor:$output, + I32Attr:$in_channels, + I32Attr:$out_channels, + I32Attr:$batch_size, + I32Attr:$input_height, + I32Attr:$input_width, + I32Attr:$kernel_height, + I32Attr:$kernel_width, + I32Attr:$stride_height, + I32Attr:$stride_width, + I32Attr:$padding_height, + I32Attr:$padding_width, + I32Attr:$dilation_height, + I32Attr:$dilation_width, + I32Attr:$groups); + + let results = (outs AnyRankedTensor:$result); + + let extraClassDeclaration = [{ + MutableOperandRange getDpsInitsMutable() { return getOutputMutable(); } + }]; + + let hasVerifier = 1; +} + def TTNN_EmptyOp : TTNN_Op<"empty"> { let summary = "Empty op."; let description = [{ diff --git a/include/ttmlir/Target/TTNN/program.fbs b/include/ttmlir/Target/TTNN/program.fbs index b163e2a13c..954f509a74 100644 --- a/include/ttmlir/Target/TTNN/program.fbs +++ b/include/ttmlir/Target/TTNN/program.fbs @@ -100,6 +100,27 @@ table MatmulOp { } // ANCHOR_END: adding_an_op_matmul_fbs +table Conv2dOp { + input: tt.target.TensorRef; + weight: tt.target.TensorRef; + bias: tt.target.TensorRef; + out: tt.target.TensorRef; + in_channels: uint32; + out_channels: uint32; + batch_size: uint32; + input_height: uint32; + input_width: uint32; + kernel_height: uint32; + kernel_width: uint32; + stride_height: uint32; + stride_width: uint32; + padding_height: uint32; + padding_width: uint32; + dilation_height: uint32; + dilation_width: uint32; + groups: uint32; +} + union OpType { OpenDeviceOp, CloseDeviceOp, @@ -112,6 +133,7 @@ union OpType { EmbeddingOp, SoftmaxOp, TransposeOp, + Conv2dOp, ConcatOp, ReshapeOp } diff --git a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp index e8cb8cd287..6f51eaa580 100644 --- a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp +++ b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp @@ -286,6 +286,71 @@ class MatmulOpConversionPattern : public OpConversionPattern { }; // ANCHOR_END: adding_an_op_matmul_op_rewriter +class Conv2dOpConversionPattern : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(ttir::Conv2dOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + auto kernel_ty = + mlir::cast(adaptor.getWeight().getType()); + llvm::ArrayRef kernel_shape = kernel_ty.getShape(); + + auto input_ty = mlir::cast(adaptor.getInput().getType()); + llvm::ArrayRef input_shape = input_ty.getShape(); + + auto output_ty = + mlir::cast(adaptor.getOutput().getType()); + llvm::ArrayRef output_shape = output_ty.getShape(); + + auto in_channels = + rewriter.getI32IntegerAttr(input_shape[input_shape.size() - 1]); + auto out_channels = + rewriter.getI32IntegerAttr(output_shape[output_shape.size() - 1]); + auto batch_size = + rewriter.getI32IntegerAttr(input_shape[input_shape.size() - 4]); + auto input_height = + rewriter.getI32IntegerAttr(input_shape[input_shape.size() - 3]); + auto input_width = + rewriter.getI32IntegerAttr(input_shape[input_shape.size() - 2]); + + auto kernel_height = + rewriter.getI32IntegerAttr(kernel_shape[kernel_shape.size() - 2]); + auto kernel_width = + rewriter.getI32IntegerAttr(kernel_shape[kernel_shape.size() - 1]); + + auto stride_height = rewriter.getI32IntegerAttr(adaptor.getStrideHeight()); + auto stride_width = rewriter.getI32IntegerAttr(adaptor.getStrideWidth()); + + assert( + adaptor.getPaddingBottom() == adaptor.getPaddingTop() && + "TTNN only supports padding height/width 
attributes. Thus, padding_top " + "must equal padding_bottom for the op to execute as expected."); + assert(adaptor.getPaddingLeft() == adaptor.getPaddingRight() && + "TTNN only supports padding height/width attributes. Thus, " + "padding_left must equal padding_right for the op to execute as " + "expected."); + auto padding_height = rewriter.getI32IntegerAttr(adaptor.getPaddingTop()); + auto padding_width = rewriter.getI32IntegerAttr(adaptor.getPaddingRight()); + + auto dilation_height = + rewriter.getI32IntegerAttr(adaptor.getDilationHeight()); + auto dilation_width = + rewriter.getI32IntegerAttr(adaptor.getDilationWidth()); + auto groups = rewriter.getI32IntegerAttr(adaptor.getGroups()); + + rewriter.replaceOpWithNewOp( + op, this->getTypeConverter()->convertType(op.getType()), + adaptor.getInput(), adaptor.getWeight(), adaptor.getBias(), + adaptor.getOutput(), in_channels, out_channels, batch_size, input_width, + input_height, kernel_height, kernel_width, stride_height, stride_width, + padding_height, padding_width, dilation_height, dilation_width, groups); + return success(); + } +}; + namespace mlir::tt { void populateTTIRToTTNNPatterns(MLIRContext *ctx, RewritePatternSet &patterns, @@ -314,7 +379,8 @@ void populateTTIRToTTNNPatterns(MLIRContext *ctx, RewritePatternSet &patterns, ReshapeOpConversionPattern, SqueezeOpConversionPattern, UnsqueezeOpConversionPattern, - MatmulOpConversionPattern + MatmulOpConversionPattern, + Conv2dOpConversionPattern >(typeConverter, ctx); // ANCHOR_END: op_rewriter_pattern_set // clang-format on diff --git a/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp b/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp index 88533cac48..1aee94c81b 100644 --- a/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp +++ b/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp @@ -203,6 +203,10 @@ void populateTTNNToEmitCPatterns(mlir::MLIRContext *ctx, patterns.add>(typeConverter, ctx); patterns.add>(typeConverter, ctx); + // Conv ops + // + patterns.add>(typeConverter, ctx); + // Other ops // patterns.add>(typeConverter, ctx); diff --git a/lib/Dialect/TTIR/IR/TTIROps.cpp b/lib/Dialect/TTIR/IR/TTIROps.cpp index 85255e497e..49d2a97429 100644 --- a/lib/Dialect/TTIR/IR/TTIROps.cpp +++ b/lib/Dialect/TTIR/IR/TTIROps.cpp @@ -387,6 +387,25 @@ ::mlir::LogicalResult mlir::tt::ttir::MatmulOp::verify() { } // ANCHOR_END: adding_an_op_matmul_ttir_verify +::mlir::LogicalResult mlir::tt::ttir::Conv2dOp::verify() { + ::mlir::RankedTensorType inputType = getInput().getType(); + ::mlir::RankedTensorType weightType = getWeight().getType(); + ::mlir::RankedTensorType biasType = + llvm::dyn_cast_or_null<::mlir::RankedTensorType>(getBias().getType()); + if (inputType.getRank() < 3) { + return emitOpError("Input must be at least a 3D tensor"); + } + if (weightType.getRank() != 4) { + return emitOpError("Weight must be a 4D tensor"); + } + if (biasType) { + if (biasType.getRank() != 4) { + return emitOpError("Bias must be a 4D tensor"); + } + } + return success(); +} + ::mlir::LogicalResult mlir::tt::ttir::AllocOp::verify() { auto layout = mlir::dyn_cast_or_null( getResult().getType().getEncoding()); diff --git a/lib/Dialect/TTIR/Transforms/Passes.cpp b/lib/Dialect/TTIR/Transforms/Passes.cpp index aca4598a5c..bf303f65e5 100644 --- a/lib/Dialect/TTIR/Transforms/Passes.cpp +++ b/lib/Dialect/TTIR/Transforms/Passes.cpp @@ -572,6 +572,11 @@ class TTIRLayoutDPSOperandsRewriter for (auto &operand : op->getOpOperands()) { bool isResult = op.isDpsInit(&operand); + // TTNN Conv2d moves input, weight, and bias from host to device + // 
itself. Inserting the ToLayoutOp on these operands is thus problematic.
+      if (mlir::isa<ttir::Conv2dOp>(op.getOperation()) && !isResult) {
+        continue;
+      }
       auto operandConstraint = mlir::cast(
           mlir::cast(op.getOperation())
diff --git a/lib/Dialect/TTNN/IR/TTNNOps.cpp b/lib/Dialect/TTNN/IR/TTNNOps.cpp
index 00dc77dce5..81d3c5c8b3 100644
--- a/lib/Dialect/TTNN/IR/TTNNOps.cpp
+++ b/lib/Dialect/TTNN/IR/TTNNOps.cpp
@@ -248,6 +248,30 @@ ::mlir::LogicalResult mlir::tt::ttnn::MatmulOp::verify() {
 }
 // ANCHOR_END: adding_an_op_matmul_ttnn_verify

+::mlir::LogicalResult mlir::tt::ttnn::Conv2dOp::verify() {
+  ::mlir::RankedTensorType inputType = getInput().getType();
+  ::mlir::RankedTensorType weightType = getWeight().getType();
+  ::mlir::RankedTensorType biasType =
+      llvm::dyn_cast_or_null<::mlir::RankedTensorType>(getBias().getType());
+
+  if (inputType.getRank() < 3) {
+    return emitOpError("Input must be at least a 3D tensor");
+  }
+  if (weightType.getRank() != 4) {
+    return emitOpError("Weight must be a 4D tensor");
+  }
+  if (biasType) {
+    if (biasType.getRank() != 4) {
+      return emitOpError("Bias must be a 4D tensor");
+    }
+    auto biasShape = biasType.getShape();
+    if (biasShape[0] != 1 || biasShape[1] != 1 || biasShape[2] != 1) {
+      return emitOpError("Bias must only have data on the final dimension");
+    }
+  }
+  return success();
+}
+
 ::mlir::LogicalResult AllocOp::verify() {
   auto layout = mlir::dyn_cast_or_null(
       getResult().getType().getEncoding());
diff --git a/lib/Target/TTNN/TTNNToFlatbuffer.cpp b/lib/Target/TTNN/TTNNToFlatbuffer.cpp
index 07b9825742..2b0ce61233 100644
--- a/lib/Target/TTNN/TTNNToFlatbuffer.cpp
+++ b/lib/Target/TTNN/TTNNToFlatbuffer.cpp
@@ -26,6 +26,7 @@
 #include "ttmlir/Target/Utils/FuncOpToProgram.h"
 #include "ttmlir/Target/Utils/MLIRToFlatbuffer.h"
 #include "ttmlir/Version.h"
+#include "types_generated.h"

 namespace mlir::tt::ttnn {

@@ -121,6 +122,27 @@ createOp(FlatbufferObjectCache &cache, MatmulOp op) {
 }
 // ANCHOR_END: adding_an_op_matmul_serialize_to_binary

+::flatbuffers::Offset<::tt::target::ttnn::Conv2dOp>
+createOp(FlatbufferObjectCache &cache, Conv2dOp op) {
+  auto in0 =
+      cache.at<::tt::target::TensorRef>(getOperandThroughDPSOps(op.getInput()));
+  auto in1 = cache.at<::tt::target::TensorRef>(
+      getOperandThroughDPSOps(op.getWeight()));
+  auto in2 = op.getODSOperands(2).empty()
+                 ? 
flatbuffers::Offset<::tt::target::TensorRef>()
+                 : cache.at<::tt::target::TensorRef>(
+                       getOperandThroughDPSOps(op.getBias()));
+  auto output = cache.at<::tt::target::TensorRef>(
+      getOperandThroughDPSOps(op.getResult()));
+  return ::tt::target::ttnn::CreateConv2dOp(
+      *cache.fbb, in0, in1, in2, output, op.getInChannels(),
+      op.getOutChannels(), op.getBatchSize(), op.getInputHeight(),
+      op.getInputWidth(), op.getKernelHeight(), op.getKernelWidth(),
+      op.getStrideHeight(), op.getStrideWidth(), op.getPaddingHeight(),
+      op.getPaddingWidth(), op.getDilationHeight(), op.getDilationWidth(),
+      op.getGroups());
+}
+
 template <typename EltwiseOp>
 ::flatbuffers::Offset<::tt::target::ttnn::EltwiseOp>
 createEltwiseOp(FlatbufferObjectCache &cache, EltwiseOp op) {
@@ -324,6 +346,9 @@ emitTTNNOperation(FlatbufferObjectCache &cache, Operation *op,
     return createOperation(cache, createTransposeOp(cache, transposeOp),
                            debugString);
   }
+  if (auto conv2dOp = dyn_cast<Conv2dOp>(op); conv2dOp) {
+    return createOperation(cache, createOp(cache, conv2dOp), debugString);
+  }
   if (auto concatOp = dyn_cast<ConcatOp>(op); concatOp) {
     return createOperation(cache, createConcatOp(cache, concatOp), debugString);
   }
diff --git a/runtime/include/tt/runtime/detail/ttnn.h b/runtime/include/tt/runtime/detail/ttnn.h
index 89405df43d..487bfdc779 100644
--- a/runtime/include/tt/runtime/detail/ttnn.h
+++ b/runtime/include/tt/runtime/detail/ttnn.h
@@ -39,8 +39,10 @@
 #pragma clang diagnostic ignored "-Wunused-but-set-variable"
 #pragma clang diagnostic ignored "-Wlogical-op-parentheses"
 #pragma clang diagnostic ignored "-Wundefined-inline"
+
 #define FMT_HEADER_ONLY
 #include "ttnn/device.hpp"
+#include "ttnn/operations/conv/conv2d/conv2d.hpp"
 #include "ttnn/operations/copy.hpp"
 #include "ttnn/operations/core/core.hpp"
 #include "ttnn/operations/creation.hpp"
@@ -59,6 +61,9 @@
 namespace tt::runtime::ttnn {

+// Default L1 small size to use for the ttnn runtime (32kb).
+constexpr std::size_t kL1SmallSize = 1 << 15;
+
 std::pair getCurrentSystemDesc();

 Tensor createTensor(std::shared_ptr<void> data,
diff --git a/runtime/lib/ttnn/program.cpp b/runtime/lib/ttnn/program.cpp
index dd5f0acc78..d43eadba16 100644
--- a/runtime/lib/ttnn/program.cpp
+++ b/runtime/lib/ttnn/program.cpp
@@ -6,11 +6,14 @@
 #include
 #include
 #include
+#include
 #include

 #include "tt/runtime/detail/ttnn.h"
 #include "tt/runtime/runtime.h"
 #include "ttmlir/Target/TTNN/program_generated.h"
+#include "ttnn/device.hpp"
+#include "ttnn/operations/conv/conv2d/conv2d.hpp"
 #include "ttnn/tensor/types.hpp"
 #include "ttnn/types.hpp"
 #include "types_generated.h"
@@ -475,6 +478,34 @@ run(::tt::target::ttnn::MatmulOp const *op, ::ttnn::Device &device,
 }
 // ANCHOR_END: adding_an_op_matmul_runtime

+static void
+run(::tt::target::ttnn::Conv2dOp const *op, ::ttnn::Device &device,
+    std::unordered_map<uint32_t, ::ttnn::Tensor *> &liveTensors,
+    std::list<::ttnn::Tensor> &tensorPool) {
+  auto &input = *liveTensors.at(op->input()->global_id());
+  auto &weight = *liveTensors.at(op->weight()->global_id());
+  std::optional<::ttnn::Tensor> bias =
+      op->bias() ? 
std::make_optional(*liveTensors.at(op->bias()->global_id())) + : std::nullopt; + auto config = ::ttnn::operations::conv::conv2d::Conv2dConfig(); + config.dtype = input.dtype(); + config.weights_dtype = weight.dtype(); + + ::ttnn::Tensor out = + std::get<0>(::ttnn::operations::conv::conv2d::conv2d<::ttnn::Device>( + input, weight, &device, op->in_channels(), op->out_channels(), + op->batch_size(), op->input_height(), op->input_width(), + {op->kernel_height(), op->kernel_width()}, + {op->stride_height(), op->stride_width()}, + {op->padding_height(), op->padding_width()}, + {op->dilation_height(), op->dilation_width()}, op->groups(), bias, + config)); + + tensorPool.push_back(out); + liveTensors.insert_or_assign(op->out()->global_id(), &tensorPool.back()); + return; +} + static void run(::tt::target::ttnn::Operation const *op, ::ttnn::Device &device, std::unordered_map &liveTensors, @@ -516,6 +547,9 @@ run(::tt::target::ttnn::Operation const *op, ::ttnn::Device &device, case ::tt::target::ttnn::OpType::TransposeOp: { return run(op->type_as_TransposeOp(), device, liveTensors, tensorPool); } + case ::tt::target::ttnn::OpType::Conv2dOp: { + return run(op->type_as_Conv2dOp(), device, liveTensors, tensorPool); + } case ::tt::target::ttnn::OpType::ConcatOp: { return run(op->type_as_ConcatOp(), device, liveTensors, tensorPool); case ::tt::target::ttnn::OpType::ReshapeOp: { diff --git a/runtime/lib/ttnn/runtime.cpp b/runtime/lib/ttnn/runtime.cpp index 37a69c154b..9cf0575138 100644 --- a/runtime/lib/ttnn/runtime.cpp +++ b/runtime/lib/ttnn/runtime.cpp @@ -60,7 +60,7 @@ Device openDevice(std::vector const &deviceIds, std::vector const &numHWCQs) { assert(deviceIds.size() == 1 && "Only one device is supported for now"); assert(numHWCQs.empty() && "HWCQs are not supported for now"); - auto &device = ::ttnn::open_device(deviceIds.front()); + auto &device = ::ttnn::open_device(deviceIds.front(), kL1SmallSize); return Device::borrow(device, DeviceRuntime::TTNN); } diff --git a/test/ttmlir/Dialect/TTNN/simple_conv.mlir b/test/ttmlir/Dialect/TTNN/simple_conv.mlir new file mode 100644 index 0000000000..5a016c5966 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/simple_conv.mlir @@ -0,0 +1,10 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline %s | FileCheck %s +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<1x32x32x64xbf16>, %arg1: tensor<64x64x3x3xbf16>, %arg2: tensor<1x1x1x64xbf16>) -> tensor<1x32x32x64xbf16> { + %0 = tensor.empty() : tensor<1x32x32x64xbf16> + // CHECK: %[[C:.*]] = "ttnn.conv2d"[[C:.*]] + %1 = "ttir.conv2d"(%arg0, %arg1, %arg2, %0) <{stride_height=1: si32, stride_width=1: si32, dilation_height=1: si32, dilation_width=1: si32, groups=1: si32, padding_left=1: si32, padding_right=1: si32, padding_top=1: si32, padding_bottom=1: si32, is_convtranspose2d=0: si32, output_height_transpose=0: si32, output_width_transpose=0: si32, stride_transpose=0: si32, operand_constraints = [#any_device, #any_device, #any_device, #any_device]}> : (tensor<1x32x32x64xbf16>, tensor<64x64x3x3xbf16>, tensor<1x1x1x64xbf16>, tensor<1x32x32x64xbf16>) -> tensor<1x32x32x64xbf16> + return %1 : tensor<1x32x32x64xbf16> + } +} From e795e09c45c4e26c22eeb2ce0b8380d5607093a0 Mon Sep 17 00:00:00 2001 From: Milan Topalovic <163355844+mtopalovicTT@users.noreply.github.com> Date: Fri, 30 Aug 2024 17:25:05 +0200 Subject: [PATCH 03/16] Adding negative dim support for `squeeze` (#561) Adding negative dim support for `squeeze` --- lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp | 4 ++++ 
lib/Dialect/TTIR/IR/TTIROps.cpp | 4 ++++ test/ttmlir/Dialect/TTNN/simple_squeeze.mlir | 2 +- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp index 6f51eaa580..f5c3c460c5 100644 --- a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp +++ b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp @@ -200,6 +200,10 @@ class SqueezeOpConversionPattern : public OpConversionPattern { // Get the squeeze dimension int32_t dim = adaptor.getDim(); + if (dim < 0) { + dim += inputType.getRank(); + } + // Get the shape of the input tensor auto inputShape = inputType.getShape(); llvm::SmallVector newShape; diff --git a/lib/Dialect/TTIR/IR/TTIROps.cpp b/lib/Dialect/TTIR/IR/TTIROps.cpp index 49d2a97429..e26d50e8d8 100644 --- a/lib/Dialect/TTIR/IR/TTIROps.cpp +++ b/lib/Dialect/TTIR/IR/TTIROps.cpp @@ -267,6 +267,10 @@ ::mlir::LogicalResult mlir::tt::ttir::SqueezeOp::verify() { ::mlir::RankedTensorType outputType = getOutput().getType(); int32_t dim = getDim(); + if (dim < 0) { + dim += inputType.getRank(); + } + // Check that the dimension `dim` is valid. if (dim < 0 || dim >= inputType.getRank()) { return emitOpError() << "Invalid dimension " << dim << " for squeezing."; diff --git a/test/ttmlir/Dialect/TTNN/simple_squeeze.mlir b/test/ttmlir/Dialect/TTNN/simple_squeeze.mlir index 1798605d8b..34367c4736 100644 --- a/test/ttmlir/Dialect/TTNN/simple_squeeze.mlir +++ b/test/ttmlir/Dialect/TTNN/simple_squeeze.mlir @@ -4,7 +4,7 @@ module attributes {} { func.func @forward(%arg0: tensor<1x2x1x32x32xbf16>) -> tensor<1x2x32x32xbf16> { %0 = tensor.empty() : tensor<1x2x32x32xbf16> // CHECK: %[[C:.*]] = "ttnn.reshape"[[C:.*]] - %1 = "ttir.squeeze"(%arg0, %0) <{dim = 2 : si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : (tensor<1x2x1x32x32xbf16>, tensor<1x2x32x32xbf16>) -> tensor<1x2x32x32xbf16> + %1 = "ttir.squeeze"(%arg0, %0) <{dim = -3 : si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : (tensor<1x2x1x32x32xbf16>, tensor<1x2x32x32xbf16>) -> tensor<1x2x32x32xbf16> return %1 : tensor<1x2x32x32xbf16> } } From 8cc0f058037b7db39f0cf51c62d0c1311950b85f Mon Sep 17 00:00:00 2001 From: Tapasvi Patel <133996364+tapspatel@users.noreply.github.com> Date: Fri, 30 Aug 2024 13:27:09 -0500 Subject: [PATCH 04/16] #445: Added load system desc path option in ttir to ttnn backend pipeline. 
Migrated more tests from ttir into ttnn silicon (#556) --- docs/src/ttrt.md | 2 + .../ttmlir/Dialect/TTNN/Pipelines/Passes.h | 7 ++ lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp | 6 +- .../ttmlir/Silicon/TTMetal/tiled_reblock.mlir | 2 +- .../TTNN/eltwise/unary/simple_reciprocal.mlir | 15 ---- .../TTNN/eltwise/unary/simple_sqrt.mlir | 15 ---- .../TTNN/embedding/embedding_1d_tensor.mlir | 16 ++++ .../TTNN/embedding/embedding_non_tile.mlir | 16 ++++ .../TTNN/embedding/simple_embedding.mlir | 16 ++++ .../Silicon/TTNN/operand_broadcasts.mlir | 26 +++++++ test/ttmlir/Silicon/TTNN/simple_div.mlir | 16 ---- test/ttmlir/Silicon/TTNN/simple_eltwise.mlir | 74 ++++++++++++++++++- test/ttmlir/Silicon/TTNN/simple_ge.mlir | 16 ---- test/ttmlir/Silicon/TTNN/simple_matmul.mlir | 4 +- test/ttmlir/Silicon/TTNN/simple_mean.mlir | 16 ++++ test/ttmlir/Silicon/TTNN/simple_multiply.mlir | 16 ---- test/ttmlir/Silicon/TTNN/simple_nop.mlir | 1 - test/ttmlir/Silicon/TTNN/simple_relu.mlir | 16 ---- test/ttmlir/Silicon/TTNN/simple_subtract.mlir | 16 ---- test/ttmlir/Silicon/TTNN/simple_sum.mlir | 4 +- test/ttmlir/Silicon/TTNN/transpose.mlir | 33 +++++++++ 21 files changed, 213 insertions(+), 120 deletions(-) delete mode 100644 test/ttmlir/Silicon/TTNN/eltwise/unary/simple_reciprocal.mlir delete mode 100644 test/ttmlir/Silicon/TTNN/eltwise/unary/simple_sqrt.mlir create mode 100644 test/ttmlir/Silicon/TTNN/embedding/embedding_1d_tensor.mlir create mode 100644 test/ttmlir/Silicon/TTNN/embedding/embedding_non_tile.mlir create mode 100644 test/ttmlir/Silicon/TTNN/embedding/simple_embedding.mlir create mode 100644 test/ttmlir/Silicon/TTNN/operand_broadcasts.mlir delete mode 100644 test/ttmlir/Silicon/TTNN/simple_div.mlir delete mode 100644 test/ttmlir/Silicon/TTNN/simple_ge.mlir create mode 100644 test/ttmlir/Silicon/TTNN/simple_mean.mlir delete mode 100644 test/ttmlir/Silicon/TTNN/simple_multiply.mlir delete mode 100644 test/ttmlir/Silicon/TTNN/simple_relu.mlir delete mode 100644 test/ttmlir/Silicon/TTNN/simple_subtract.mlir create mode 100644 test/ttmlir/Silicon/TTNN/transpose.mlir diff --git a/docs/src/ttrt.md b/docs/src/ttrt.md index 26be66bcff..297f505bd4 100644 --- a/docs/src/ttrt.md +++ b/docs/src/ttrt.md @@ -63,6 +63,8 @@ ttrt query --save-artifacts 4. Use ttmlir-opt tool in compiler to feed system descriptor. See the [ttmlir-opt](./ttmlir-opt.md) documentation for more information on how to generate .mlir files. ```bash ./build/bin/ttmlir-opt --ttir-load-system-desc="path=/path/to/system_desc.ttsys" --ttir-to-ttnn-backend-pipeline test/ttmlir/Dialect/TTNN/simple_subtract.mlir -o ttnn.mlir +or (pip path directly into ttir-to-ttnn-backend-pipeline) +./build/bin/ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=/path/to/system_desc.ttsys" test/ttmlir/Dialect/TTNN/simple_subtract.mlir -o ttnn.mlir ``` 5. Use ttmlir-translate tool in compiler to generate the flatbuffer executable. See the [ttmlir-translate](./ttmlir-translate.md) documentation for more information on how to generate flatbuffer files. 
```bash diff --git a/include/ttmlir/Dialect/TTNN/Pipelines/Passes.h b/include/ttmlir/Dialect/TTNN/Pipelines/Passes.h index 141745a1dc..c44dc99e94 100644 --- a/include/ttmlir/Dialect/TTNN/Pipelines/Passes.h +++ b/include/ttmlir/Dialect/TTNN/Pipelines/Passes.h @@ -87,6 +87,13 @@ struct TTIRToTTNNBackendPipelineOptions *this, "override-grid-sizes", llvm::cl::desc("Override grid sizes for specific ops."), llvm::cl::init(llvm::StringMap>())}; + + // Option to provide a system descriptor flatbuffer file to compile against + Option systemDescPath{ + *this, "system-desc-path", + llvm::cl::desc( + "Pass in a system descriptor flatbuffer to compile against."), + llvm::cl::init("")}; }; void createTTIRToTTNNBackendPipeline( diff --git a/lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp b/lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp index 9a84025e6d..cb78dfd682 100644 --- a/lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp +++ b/lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp @@ -15,7 +15,11 @@ namespace mlir::tt::ttnn { void createTTIRToTTNNBackendPipeline( OpPassManager &pm, const TTIRToTTNNBackendPipelineOptions &options) { - pm.addPass(mlir::tt::ttir::createTTIRLoadSystemDesc()); + + ttir::TTIRLoadSystemDescOptions systemDescOptions; + systemDescOptions.path = options.systemDescPath; + pm.addPass(mlir::tt::ttir::createTTIRLoadSystemDesc(systemDescOptions)); + pm.addPass(mlir::tt::ttir::createTTIRImplicitDevice()); mlir::tt::ttir::TTIRLayoutOptions layoutOptions; layoutOptions.initMemorySpace = mlir::tt::MemorySpace::System; diff --git a/test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir b/test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir index 560deec7d3..1664ceba3e 100644 --- a/test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir +++ b/test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir @@ -1,4 +1,4 @@ -// RUN: ttmlir-opt --ttir-load-system-desc --ttir-implicit-device --ttir-allocate --convert-ttir-to-ttmetal %s | FileCheck %s +// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-allocate --convert-ttir-to-ttmetal %s | FileCheck %s // UNSUPPORTED: true #l1_ = #tt.memory_space diff --git a/test/ttmlir/Silicon/TTNN/eltwise/unary/simple_reciprocal.mlir b/test/ttmlir/Silicon/TTNN/eltwise/unary/simple_reciprocal.mlir deleted file mode 100644 index 2239c3b0cc..0000000000 --- a/test/ttmlir/Silicon/TTNN/eltwise/unary/simple_reciprocal.mlir +++ /dev/null @@ -1,15 +0,0 @@ -// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline --ttir-load-system-desc="path=%system_desc_path%" %s > %t.mlir -// RUN: FileCheck %s --input-file=%t.mlir -// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn -#any_device = #tt.operand_constraint -module attributes {} { - func.func @forward(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> { - // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] - // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] - %0 = tensor.empty() : tensor<64x128xf32> - // CHECK: %[[C:.*]] = "ttnn.reciprocal"[[C:.*]] - %1 = "ttir.reciprocal"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> - // CHECK: "ttnn.close_device"[[C:.*]] - return %1 : tensor<64x128xf32> - } -} diff --git a/test/ttmlir/Silicon/TTNN/eltwise/unary/simple_sqrt.mlir b/test/ttmlir/Silicon/TTNN/eltwise/unary/simple_sqrt.mlir deleted file mode 100644 index 7a7111d338..0000000000 --- a/test/ttmlir/Silicon/TTNN/eltwise/unary/simple_sqrt.mlir +++ /dev/null @@ -1,15 +0,0 @@ -// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline 
--ttir-load-system-desc="path=%system_desc_path%" %s > %t.mlir -// RUN: FileCheck %s --input-file=%t.mlir -// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn -#any_device = #tt.operand_constraint -module attributes {} { - func.func @forward(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> { - // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] - // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] - %0 = tensor.empty() : tensor<64x128xf32> - // CHECK: %[[C:.*]] = "ttnn.sqrt"[[C:.*]] - %1 = "ttir.sqrt"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> - // CHECK: "ttnn.close_device"[[C:.*]] - return %1 : tensor<64x128xf32> - } -} diff --git a/test/ttmlir/Silicon/TTNN/embedding/embedding_1d_tensor.mlir b/test/ttmlir/Silicon/TTNN/embedding/embedding_1d_tensor.mlir new file mode 100644 index 0000000000..8c16055cf6 --- /dev/null +++ b/test/ttmlir/Silicon/TTNN/embedding/embedding_1d_tensor.mlir @@ -0,0 +1,16 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +// UNSUPPORTED: true +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<32xf32>, %arg1: tensor<512x128xf32>) -> tensor<32x128xf32> { + // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] + // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] + %0 = tensor.empty() : tensor<32x128xf32> + // CHECK: %[[C:.*]] = "ttnn.embedding"[[C:.*]] + %1 = "ttir.embedding"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<32xf32>, tensor<512x128xf32>, tensor<32x128xf32>) -> tensor<32x128xf32> + // CHECK: "ttnn.close_device"[[C:.*]] + return %1 : tensor<32x128xf32> + } +} diff --git a/test/ttmlir/Silicon/TTNN/embedding/embedding_non_tile.mlir b/test/ttmlir/Silicon/TTNN/embedding/embedding_non_tile.mlir new file mode 100644 index 0000000000..c538cf9e41 --- /dev/null +++ b/test/ttmlir/Silicon/TTNN/embedding/embedding_non_tile.mlir @@ -0,0 +1,16 @@ +// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-layout --ttnn-open-device --convert-ttir-to-ttnn %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +// UNSUPPORTED: true +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<1x32xf32>, %arg1: tensor<512x128xf32>) -> tensor<1x32x128xf32> { + // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] + // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] + %0 = tensor.empty() : tensor<1x32x128xf32> + // CHECK: %[[C:.*]] = "ttnn.embedding"[[C:.*]] + %1 = "ttir.embedding"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<1x32xf32>, tensor<512x128xf32>, tensor<1x32x128xf32>) -> tensor<1x32x128xf32> + // CHECK: "ttnn.close_device"[[C:.*]] + return %1 : tensor<1x32x128xf32> + } +} diff --git a/test/ttmlir/Silicon/TTNN/embedding/simple_embedding.mlir b/test/ttmlir/Silicon/TTNN/embedding/simple_embedding.mlir new file mode 100644 index 0000000000..4daa472fb4 --- /dev/null +++ b/test/ttmlir/Silicon/TTNN/embedding/simple_embedding.mlir @@ -0,0 +1,16 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: 
ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +// UNSUPPORTED: true +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<32x32xf32>, %arg1: tensor<512x128xf32>) -> tensor<32x32x128xf32> { + // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] + // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] + %0 = tensor.empty() : tensor<32x32x128xf32> + // CHECK: %[[C:.*]] = "ttnn.embedding"[[C:.*]] + %1 = "ttir.embedding"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<32x32xf32>, tensor<512x128xf32>, tensor<32x32x128xf32>) -> tensor<32x32x128xf32> + // CHECK: "ttnn.close_device"[[C:.*]] + return %1 : tensor<32x32x128xf32> + } +} diff --git a/test/ttmlir/Silicon/TTNN/operand_broadcasts.mlir b/test/ttmlir/Silicon/TTNN/operand_broadcasts.mlir new file mode 100644 index 0000000000..1bcea0d1c3 --- /dev/null +++ b/test/ttmlir/Silicon/TTNN/operand_broadcasts.mlir @@ -0,0 +1,26 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +#any_device = #tt.operand_constraint +module attributes {} { + func.func @bcast_one_dim(%arg0: tensor<2x64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<2x64x128xf32> { + // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] + // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] + %0 = tensor.empty() : tensor<2x64x128xf32> + // CHECK: %[[C:.*]] = "ttnn.multiply"[[C:.*]] + %1 = "ttir.multiply"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<2x64x128xf32>, tensor<64x128xf32>, tensor<2x64x128xf32>) -> tensor<2x64x128xf32> + // CHECK: "ttnn.close_device"[[C:.*]] + return %1 : tensor<2x64x128xf32> + } + + func.func @bcast_multi_dim(%arg0: tensor<17x16x15x14xf32>, %arg1: tensor<15x1xf32>) -> tensor<17x16x15x14xf32> { + // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] + // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] + %0 = tensor.empty() : tensor<17x16x15x14xf32> + // CHECK: %[[C:.*]] = "ttnn.multiply"[[C:.*]] + %1 = "ttir.multiply"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<17x16x15x14xf32>, tensor<15x1xf32>, tensor<17x16x15x14xf32>) -> tensor<17x16x15x14xf32> + // CHECK: "ttnn.close_device"[[C:.*]] + return %1 : tensor<17x16x15x14xf32> + } + +} diff --git a/test/ttmlir/Silicon/TTNN/simple_div.mlir b/test/ttmlir/Silicon/TTNN/simple_div.mlir deleted file mode 100644 index f5c7ee878d..0000000000 --- a/test/ttmlir/Silicon/TTNN/simple_div.mlir +++ /dev/null @@ -1,16 +0,0 @@ -// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-layout --ttnn-open-device --convert-ttir-to-ttnn %s > %t.mlir -// RUN: FileCheck %s --input-file=%t.mlir -// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn - -#any_device = #tt.operand_constraint -module attributes {} { - func.func @forward(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { - // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] - // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] - %0 = tensor.empty() : tensor<64x128xf32> - // CHECK: %[[C:.*]] = "ttnn.div"[[C:.*]] - %1 = "ttir.div"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>, tensor<64x128xf32>) -> 
tensor<64x128xf32> - // CHECK: "ttnn.close_device"[[C:.*]] - return %1 : tensor<64x128xf32> - } -} diff --git a/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir b/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir index ed3829935c..6afbe4d8e1 100644 --- a/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir +++ b/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir @@ -1,8 +1,8 @@ -// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-layout --ttnn-open-device --convert-ttir-to-ttnn %s > %t.mlir +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir // RUN: FileCheck %s --input-file=%t.mlir // RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn - #any_device = #tt.operand_constraint +#any_device_tile = #tt.operand_constraint func.func @subtract(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] @@ -53,3 +53,73 @@ func.func @ge(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64 // CHECK: "ttnn.close_device"[[C:.*]] return %1 : tensor<64x128xf32> } + +func.func @concat(%arg0: tensor<32x32xf32>, %arg1: tensor<32x64xf32>) -> tensor<32x96xf32> { + // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] + // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] + %0 = tensor.empty() : tensor<32x96xf32> + // CHECK: %[[C:.*]] = "ttnn.concat"[[C:.*]] + %1 = "ttir.concat"(%arg0, %arg1, %0) <{dim = 1 : si32, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<32x32xf32>, tensor<32x64xf32>, tensor<32x96xf32>) -> tensor<32x96xf32> + // CHECK: "ttnn.close_device"[[C:.*]] + return %1 : tensor<32x96xf32> +} + +func.func @reshape(%arg0: tensor<4x2x32x32xbf16>) -> tensor<2x4x32x32xbf16> { + %0 = tensor.empty() : tensor<2x4x32x32xbf16> + // CHECK: %[[C:.*]] = "ttnn.reshape"[[C:.*]] + %1 = "ttir.reshape"(%arg0, %0) <{shape = [2: i32, 4: i32, 32: i32, 32: i32] , operand_constraints = [#any_device_tile, #any_device_tile]}> : (tensor<4x2x32x32xbf16>, tensor<2x4x32x32xbf16>) -> tensor<2x4x32x32xbf16> + return %1 : tensor<2x4x32x32xbf16> +} + +func.func @squeeze(%arg0: tensor<1x2x1x32x32xbf16>) -> tensor<1x2x32x32xbf16> { + %0 = tensor.empty() : tensor<1x2x32x32xbf16> + // CHECK: %[[C:.*]] = "ttnn.reshape"[[C:.*]] + %1 = "ttir.squeeze"(%arg0, %0) <{dim = 2 : si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : (tensor<1x2x1x32x32xbf16>, tensor<1x2x32x32xbf16>) -> tensor<1x2x32x32xbf16> + return %1 : tensor<1x2x32x32xbf16> +} + +func.func @reciprocal(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> { + // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] + // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] + %0 = tensor.empty() : tensor<64x128xf32> + // CHECK: %[[C:.*]] = "ttnn.reciprocal"[[C:.*]] + %1 = "ttir.reciprocal"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> + // CHECK: "ttnn.close_device"[[C:.*]] + return %1 : tensor<64x128xf32> +} + +func.func @sigmoid(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> { + // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] + // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] + %0 = tensor.empty() : tensor<64x128xf32> + // CHECK: %[[C:.*]] = "ttnn.sigmoid"[[C:.*]] + %1 = "ttir.sigmoid"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> + // CHECK: "ttnn.close_device"[[C:.*]] + return %1 : 
tensor<64x128xf32> +} + +func.func @sqrt(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> { + // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] + // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] + %0 = tensor.empty() : tensor<64x128xf32> + // CHECK: %[[C:.*]] = "ttnn.sqrt"[[C:.*]] + %1 = "ttir.sqrt"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> + // CHECK: "ttnn.close_device"[[C:.*]] + return %1 : tensor<64x128xf32> +} + +func.func @softmax(%arg0: tensor<512x1024xbf16>) -> tensor<512x1024xbf16> { + // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] + // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] + %0 = tensor.empty() : tensor<512x1024xbf16> + // CHECK: %[[C:.*]] = "ttnn.softmax"[[C:.*]] + // Check for positive dimension attribute + %1 = "ttir.softmax"(%arg0, %0) <{dimension = 1 : si32, operand_constraints = [#any_device, #any_device]}> : (tensor<512x1024xbf16>, tensor<512x1024xbf16>) -> tensor<512x1024xbf16> + // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] + %2 = tensor.empty() : tensor<512x1024xbf16> + // CHECK: %[[C:.*]] = "ttnn.softmax"[[C:.*]] + // Check for negative dimension attribute + %3 = "ttir.softmax"(%1, %2) <{dimension = -1 : si32, operand_constraints = [#any_device, #any_device]}> : (tensor<512x1024xbf16>, tensor<512x1024xbf16>) -> tensor<512x1024xbf16> + // CHECK: "ttnn.close_device"[[C:.*]] + return %3 : tensor<512x1024xbf16> +} diff --git a/test/ttmlir/Silicon/TTNN/simple_ge.mlir b/test/ttmlir/Silicon/TTNN/simple_ge.mlir deleted file mode 100644 index c2efad81c2..0000000000 --- a/test/ttmlir/Silicon/TTNN/simple_ge.mlir +++ /dev/null @@ -1,16 +0,0 @@ -// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-layout --ttnn-open-device --convert-ttir-to-ttnn %s > %t.mlir -// RUN: FileCheck %s --input-file=%t.mlir -// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn - -#any_device = #tt.operand_constraint -module attributes {} { - func.func @forward(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { - // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] - // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] - %0 = tensor.empty() : tensor<64x128xf32> - // CHECK: %[[C:.*]] = "ttnn.ge"[[C:.*]] - %1 = "ttir.ge"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> - // CHECK: "ttnn.close_device"[[C:.*]] - return %1 : tensor<64x128xf32> - } -} diff --git a/test/ttmlir/Silicon/TTNN/simple_matmul.mlir b/test/ttmlir/Silicon/TTNN/simple_matmul.mlir index c97518ce7d..a90e7817b5 100644 --- a/test/ttmlir/Silicon/TTNN/simple_matmul.mlir +++ b/test/ttmlir/Silicon/TTNN/simple_matmul.mlir @@ -1,8 +1,8 @@ -// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-layout --ttnn-open-device --convert-ttir-to-ttnn %s > %t.mlir +// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-layout --ttnn-open-device --convert-ttir-to-ttnn %s > %t.mlir // RUN: FileCheck %s --input-file=%t.mlir // RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn - #any_device_tile = #tt.operand_constraint +// CHECK: #[[TILED_LAYOUT:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #l1_>> module attributes {} { func.func @forward(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x96xbf16>) -> 
tensor<64x96xbf16> { %0 = tensor.empty() : tensor<64x96xbf16> diff --git a/test/ttmlir/Silicon/TTNN/simple_mean.mlir b/test/ttmlir/Silicon/TTNN/simple_mean.mlir new file mode 100644 index 0000000000..c3705a6ff0 --- /dev/null +++ b/test/ttmlir/Silicon/TTNN/simple_mean.mlir @@ -0,0 +1,16 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +// UNSUPPORTED: true +#any_device = #tt.operand_constraint +module { + func.func @forward(%arg0: tensor<512x1024xbf16>) -> tensor<512x32xbf16> { + // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] + // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] + %0 = tensor.empty() : tensor<512x32xbf16> + // CHECK: %[[C:.*]] = "ttnn.mean"[[C:.*]] + %1 = "ttir.mean"(%arg0, %0) <{dim_arg = [-1: i32], keep_dim = true, operand_constraints = [#any_device, #any_device]}> : (tensor<512x1024xbf16>, tensor<512x32xbf16>) -> tensor<512x32xbf16> + // CHECK: "ttnn.close_device"[[C:.*]] + return %1 : tensor<512x32xbf16> + } +} diff --git a/test/ttmlir/Silicon/TTNN/simple_multiply.mlir b/test/ttmlir/Silicon/TTNN/simple_multiply.mlir deleted file mode 100644 index 91792a2ceb..0000000000 --- a/test/ttmlir/Silicon/TTNN/simple_multiply.mlir +++ /dev/null @@ -1,16 +0,0 @@ -// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-layout --ttnn-open-device --convert-ttir-to-ttnn %s > %t.mlir -// RUN: FileCheck %s --input-file=%t.mlir -// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn - -#any_device = #tt.operand_constraint -module attributes {} { - func.func @forward(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { - // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] - // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] - %0 = tensor.empty() : tensor<64x128xf32> - // CHECK: %[[C:.*]] = "ttnn.multiply"[[C:.*]] - %1 = "ttir.multiply"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> - // CHECK: "ttnn.close_device"[[C:.*]] - return %1 : tensor<64x128xf32> - } -} diff --git a/test/ttmlir/Silicon/TTNN/simple_nop.mlir b/test/ttmlir/Silicon/TTNN/simple_nop.mlir index 0bce6b0a15..7cf9b1bd20 100644 --- a/test/ttmlir/Silicon/TTNN/simple_nop.mlir +++ b/test/ttmlir/Silicon/TTNN/simple_nop.mlir @@ -1,7 +1,6 @@ // RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-layout --ttnn-open-device --convert-ttir-to-ttnn %s > %t.mlir // RUN: FileCheck %s --input-file=%t.mlir // RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn - module @jit_convert_element_type attributes {mhlo.num_partitions = 1 : i32, mhlo.num_replicas = 1 : i32} { func.func public @main(%arg0: tensor<2x2xf32> {mhlo.layout_mode = "default"}) -> (tensor<2x2xf32> {jax.result_info = "", mhlo.layout_mode = "default"}) { // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] diff --git a/test/ttmlir/Silicon/TTNN/simple_relu.mlir b/test/ttmlir/Silicon/TTNN/simple_relu.mlir deleted file mode 100644 index c53100894b..0000000000 --- a/test/ttmlir/Silicon/TTNN/simple_relu.mlir +++ /dev/null @@ -1,16 +0,0 @@ -// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-layout --ttnn-open-device --convert-ttir-to-ttnn %s > %t.mlir -// RUN: FileCheck %s --input-file=%t.mlir -// RUN: ttmlir-translate 
--ttnn-to-flatbuffer %t.mlir > %t.ttnn - -#any_device = #tt.operand_constraint -module attributes {} { - func.func @forward(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> { - // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] - // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] - %0 = tensor.empty() : tensor<64x128xf32> - // CHECK: %[[C:.*]] = "ttnn.relu"[[C:.*]] - %1 = "ttir.relu"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> - // CHECK: "ttnn.close_device"[[C:.*]] - return %1 : tensor<64x128xf32> - } -} diff --git a/test/ttmlir/Silicon/TTNN/simple_subtract.mlir b/test/ttmlir/Silicon/TTNN/simple_subtract.mlir deleted file mode 100644 index b15f3d2775..0000000000 --- a/test/ttmlir/Silicon/TTNN/simple_subtract.mlir +++ /dev/null @@ -1,16 +0,0 @@ -// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-layout --ttnn-open-device --convert-ttir-to-ttnn %s > %t.mlir -// RUN: FileCheck %s --input-file=%t.mlir -// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn - -#any_device = #tt.operand_constraint -module attributes {} { - func.func @forward(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { - // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] - // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] - %0 = tensor.empty() : tensor<64x128xf32> - // CHECK: %[[C:.*]] = "ttnn.subtract"[[C:.*]] - %1 = "ttir.subtract"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> - // CHECK: "ttnn.close_device"[[C:.*]] - return %1 : tensor<64x128xf32> - } -} diff --git a/test/ttmlir/Silicon/TTNN/simple_sum.mlir b/test/ttmlir/Silicon/TTNN/simple_sum.mlir index a976499643..9af10c8a8b 100644 --- a/test/ttmlir/Silicon/TTNN/simple_sum.mlir +++ b/test/ttmlir/Silicon/TTNN/simple_sum.mlir @@ -1,9 +1,7 @@ -// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-layout --ttnn-open-device --convert-ttir-to-ttnn %s > %t.mlir +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir // RUN: FileCheck %s --input-file=%t.mlir // RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn -// https://github.com/tenstorrent/tt-mlir/issues/528 // UNSUPPORTED: true - #any_device = #tt.operand_constraint module attributes {} { func.func @forward(%arg0: tensor<512x1024xbf16>) -> tensor<512x32xbf16> { diff --git a/test/ttmlir/Silicon/TTNN/transpose.mlir b/test/ttmlir/Silicon/TTNN/transpose.mlir new file mode 100644 index 0000000000..184b6b8076 --- /dev/null +++ b/test/ttmlir/Silicon/TTNN/transpose.mlir @@ -0,0 +1,33 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +#any_device = #tt.operand_constraint +#any_device_tile = #tt.operand_constraint + +func.func @transpose(%arg0: tensor<64x128xbf16>) -> tensor<128x64xbf16> { + %0 = tensor.empty() : tensor<128x64xbf16> + // CHECK: %[[C:.*]] = "ttnn.transpose"[[C:.*]] + %1 = "ttir.transpose"(%arg0, %0) <{dim0 = 0 : si32, dim1 = 1 : si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : (tensor<64x128xbf16>, tensor<128x64xbf16>) -> tensor<128x64xbf16> + return %1 : tensor<128x64xbf16> +} + +func.func 
@transpose_8x8(%arg0: tensor<32x32xbf16>) -> tensor<32x32xbf16> { + %0 = tensor.empty() : tensor<32x32xbf16> + // CHECK: %[[C:.*]] = "ttnn.transpose"[[C:.*]] + %1 = "ttir.transpose"(%arg0, %0) <{dim0 = 0 : si32, dim1 = 1 : si32, operand_constraints = [#any_device, #any_device]}> : (tensor<32x32xbf16>, tensor<32x32xbf16>) -> tensor<32x32xbf16> + return %1 : tensor<32x32xbf16> +} + +func.func @transpose_8x16_reverse_dims(%arg0: tensor<64x16xbf16>) -> tensor<16x64xbf16> { + %0 = tensor.empty() : tensor<16x64xbf16> + // CHECK: %[[C:.*]] = "ttnn.transpose"[[C:.*]] + %1 = "ttir.transpose"(%arg0, %0) <{dim0 = 1 : si32, dim1 = 0 : si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : (tensor<64x16xbf16>, tensor<16x64xbf16>) -> tensor<16x64xbf16> + return %1 : tensor<16x64xbf16> +} + +func.func @transpose_negative_dims(%arg0: tensor<32x32xbf16>) -> tensor<32x32xbf16> { + %0 = tensor.empty() : tensor<32x32xbf16> + // CHECK: %[[C:.*]] = "ttnn.transpose"[[C:.*]] + %1 = "ttir.transpose"(%arg0, %0) <{dim0 = -1 : si32, dim1 = -2 : si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : (tensor<32x32xbf16>, tensor<32x32xbf16>) -> tensor<32x32xbf16> + return %1 : tensor<32x32xbf16> +} From 19c54072584ae3ba0c5be20c536b3a82c1e42cab Mon Sep 17 00:00:00 2001 From: Tapasvi Patel <133996364+tapspatel@users.noreply.github.com> Date: Fri, 30 Aug 2024 16:13:46 -0500 Subject: [PATCH 05/16] #563: Enable mixed ttnn and ttm runtime in ttrt (#565) --- runtime/tools/python/CMakeLists.txt | 2 +- runtime/tools/python/ttrt/common/api.py | 173 ++++++++++++------------ 2 files changed, 89 insertions(+), 86 deletions(-) diff --git a/runtime/tools/python/CMakeLists.txt b/runtime/tools/python/CMakeLists.txt index 791541cbb1..84810a4cf3 100644 --- a/runtime/tools/python/CMakeLists.txt +++ b/runtime/tools/python/CMakeLists.txt @@ -4,7 +4,7 @@ add_custom_target(ttrt-copy-files ) add_custom_target(ttrt - COMMAND rm -f *.whl + COMMAND rm -f build/*.whl COMMAND TTMLIR_ENABLE_RUNTIME=${TTMLIR_ENABLE_RUNTIME} TT_RUNTIME_ENABLE_TTNN=${TT_RUNTIME_ENABLE_TTNN} TT_RUNTIME_ENABLE_TTMETAL=${TT_RUNTIME_ENABLE_TTMETAL} diff --git a/runtime/tools/python/ttrt/common/api.py b/runtime/tools/python/ttrt/common/api.py index f2c63a80ac..4b2d3e4185 100644 --- a/runtime/tools/python/ttrt/common/api.py +++ b/runtime/tools/python/ttrt/common/api.py @@ -793,112 +793,115 @@ def _execute(binaries): self.logging.debug(f"setting torch manual seed={self['seed']}") torch.manual_seed(self["seed"]) ttrt.runtime.set_compatible_runtime(binaries[0].fbb) - self.logging.debug(f"opening device id={self.query.device_ids[0]}") device = ttrt.runtime.open_device([self.query.device_ids[0]]) - atexit.register(lambda: ttrt.runtime.close_device(device)) - - for bin in binaries: - self.logging.info(f"evaluating binary={bin.file_path}") - - program_indices = [] - if self["program_index"] == "all": - program_indices.extend(range(bin.get_num_programs())) - else: - program_indices.append(int(self["program_index"])) - for program_index in program_indices: - self.logging.debug( - f"evaluating program={program_index} for binary={bin.file_path}" - ) + try: + for bin in binaries: + self.logging.info(f"evaluating binary={bin.file_path}") - program = bin.get_program(program_index) - program.populate_inputs( - API.Run.TorchInitilizer.get_initilizer(self["init"]) - ) - program.populate_outputs( - API.Run.TorchInitilizer.get_initilizer("zeros") - ) + program_indices = [] + if self["program_index"] == "all": + program_indices.extend(range(bin.get_num_programs())) + 
else: + program_indices.append(int(self["program_index"])) - total_inputs = [] - total_outputs = [] - for loop in range(self["loops"]): + for program_index in program_indices: self.logging.debug( - f"generating inputs/outputs for loop={loop+1}/{self['loops']} for binary={bin.file_path}" + f"evaluating program={program_index} for binary={bin.file_path}" ) - inputs = [] - outputs = [] - for i in program.input_tensors: - inputs.append( - ttrt.runtime.create_tensor( - i.data_ptr(), - list(i.shape), - list(i.stride()), - i.element_size(), - Binary.Program.to_data_type(i.dtype), - ) + program = bin.get_program(program_index) + program.populate_inputs( + API.Run.TorchInitilizer.get_initilizer(self["init"]) + ) + program.populate_outputs( + API.Run.TorchInitilizer.get_initilizer("zeros") + ) + + total_inputs = [] + total_outputs = [] + for loop in range(self["loops"]): + self.logging.debug( + f"generating inputs/outputs for loop={loop+1}/{self['loops']} for binary={bin.file_path}" ) - for i in program.output_tensors: - outputs.append( - ttrt.runtime.create_tensor( - i.data_ptr(), - list(i.shape), - list(i.stride()), - i.element_size(), - Binary.Program.to_data_type(i.dtype), + inputs = [] + outputs = [] + for i in program.input_tensors: + inputs.append( + ttrt.runtime.create_tensor( + i.data_ptr(), + list(i.shape), + list(i.stride()), + i.element_size(), + Binary.Program.to_data_type(i.dtype), + ) ) - ) - total_inputs.append(inputs) - total_outputs.append(outputs) + for i in program.output_tensors: + outputs.append( + ttrt.runtime.create_tensor( + i.data_ptr(), + list(i.shape), + list(i.stride()), + i.element_size(), + Binary.Program.to_data_type(i.dtype), + ) + ) - event = None - for loop in range(self["loops"]): - self.logging.debug( - f"starting loop={loop+1}/{self['loops']} for binary={bin.file_path}" - ) + total_inputs.append(inputs) + total_outputs.append(outputs) - event = ttrt.runtime.submit( - device, - bin.fbb, - program_index, - total_inputs[loop], - total_outputs[loop], - ) + event = None + for loop in range(self["loops"]): + self.logging.debug( + f"starting loop={loop+1}/{self['loops']} for binary={bin.file_path}" + ) - self.logging.debug( - f"finished loop={loop+1}/{self['loops']} for binary={bin.file_path}" - ) + event = ttrt.runtime.submit( + device, + bin.fbb, + program_index, + total_inputs[loop], + total_outputs[loop], + ) - ttrt.runtime.wait(event) + self.logging.debug( + f"finished loop={loop+1}/{self['loops']} for binary={bin.file_path}" + ) - if self["identity"]: - self.logging.debug( - f"checking identity with rtol={self['rtol']} and atol={self['atol']}" - ) + ttrt.runtime.wait(event) - for i, o in zip( - program.input_tensors, program.output_tensors - ): - if not torch.allclose( - i, o, rtol=self["rtol"], atol=self["atol"] + if self["identity"]: + self.logging.debug( + f"checking identity with rtol={self['rtol']} and atol={self['atol']}" + ) + + for i, o in zip( + program.input_tensors, program.output_tensors ): - self.logging.error( - f"Failed: inputs and outputs do not match in binary" - ) - self.logging.error(i - o) + if not torch.allclose( + i, o, rtol=self["rtol"], atol=self["atol"] + ): + self.logging.error( + f"Failed: inputs and outputs do not match in binary" + ) + self.logging.error(i - o) - self.logging.debug(f"input tensors for program={program_index}") - for tensor in program.input_tensors: - self.logging.debug(f"{tensor}\n") + self.logging.debug( + f"input tensors for program={program_index}" + ) + for tensor in program.input_tensors: + 
self.logging.debug(f"{tensor}\n") - self.logging.debug( - f"output tensors for program={program_index}" - ) - for tensor in program.output_tensors: - self.logging.debug(f"{tensor}\n") + self.logging.debug( + f"output tensors for program={program_index}" + ) + for tensor in program.output_tensors: + self.logging.debug(f"{tensor}\n") + finally: + ttrt.runtime.close_device(device) self.logging.debug(f"executing ttnn binaries") _execute(self.ttnn_binaries) From 0b83c39218dbfd8bfa5242b24a3bfda352188ddb Mon Sep 17 00:00:00 2001 From: Nick Smith <127986401+nsmithtt@users.noreply.github.com> Date: Sat, 31 Aug 2024 07:36:58 -0700 Subject: [PATCH 06/16] Add a new TTIR Layout pass option defaultMemorySpace (#564) --- .../ttmlir/Dialect/TTIR/Transforms/Passes.td | 4 ++ lib/Dialect/TTIR/Transforms/Passes.cpp | 47 +++++++++++++++---- lib/Dialect/TTMetal/Transforms/Passes.cpp | 1 + lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp | 1 + test/ttmlir/Dialect/TTIR/test_grid_set.mlir | 2 +- .../Dialect/TTNN/multiple_add_with_loc.mlir | 2 +- .../multiple_add_with_loc_grid_override.mlir | 4 +- test/ttmlir/Dialect/TTNN/simple_matmul.mlir | 2 +- .../Dialect/TTNN/ttir_to_ttnn_pipeline.mlir | 2 +- .../ttir_to_ttnn_pipeline_custom_opt.mlir | 2 +- 10 files changed, 50 insertions(+), 17 deletions(-) diff --git a/include/ttmlir/Dialect/TTIR/Transforms/Passes.td b/include/ttmlir/Dialect/TTIR/Transforms/Passes.td index de7ac591af..c5a67e76c5 100644 --- a/include/ttmlir/Dialect/TTIR/Transforms/Passes.td +++ b/include/ttmlir/Dialect/TTIR/Transforms/Passes.td @@ -47,6 +47,10 @@ def TTIRLayout: Pass<"ttir-layout", "::mlir::ModuleOp"> { "::mlir::tt::MemorySpace", /*default=*/"::mlir::tt::MemorySpace::System", "Set the initial memory space for tensors to start in">, + Option<"defaultMemorySpace", "default-memory-space", + "::mlir::tt::MemorySpace", + /*default=*/"::mlir::tt::MemorySpace::DeviceDRAM", + "Set the default memory space for layout pass to prefer for operation operands, if not constrained">, ]; } diff --git a/lib/Dialect/TTIR/Transforms/Passes.cpp b/lib/Dialect/TTIR/Transforms/Passes.cpp index bf303f65e5..9e77b4c66c 100644 --- a/lib/Dialect/TTIR/Transforms/Passes.cpp +++ b/lib/Dialect/TTIR/Transforms/Passes.cpp @@ -420,13 +420,31 @@ inline MemorySpace getMemorySpace(RankedTensorType ty) { return getMemorySpace(layout); } -inline MemorySpace uppermostMemorySpace(OperandConstraint operandConstraint) { - if (bitEnumContainsAny(operandConstraint, OperandConstraint::L1)) { - return MemorySpace::DeviceL1; +inline OperandConstraint +memorySpaceAsOperandConstraint(MemorySpace memorySpace) { + switch (memorySpace) { + case MemorySpace::System: + case MemorySpace::SystemMMIO: + return OperandConstraint::System; + case MemorySpace::DeviceDRAM: + return OperandConstraint::DRAM; + case MemorySpace::DeviceL1: + return OperandConstraint::L1; + } +} + +inline MemorySpace getLegalMemorySpace(OperandConstraint operandConstraint, + MemorySpace defaultMemorySpace) { + if (bitEnumContainsAny(operandConstraint, + memorySpaceAsOperandConstraint(defaultMemorySpace))) { + return defaultMemorySpace; } if (bitEnumContainsAny(operandConstraint, OperandConstraint::DRAM)) { return MemorySpace::DeviceDRAM; } + if (bitEnumContainsAny(operandConstraint, OperandConstraint::L1)) { + return MemorySpace::DeviceL1; + } return MemorySpace::System; } @@ -547,8 +565,10 @@ static std::optional createToLayoutOp(PatternRewriter &rewriter, static std::optional createToLayoutOp(PatternRewriter &rewriter, Location loc, Value input, - OperandConstraint 
operandConstraint) { - auto desiredMemorySpace = uppermostMemorySpace(operandConstraint); + OperandConstraint operandConstraint, + MemorySpace defaultMemorySpace) { + auto desiredMemorySpace = + getLegalMemorySpace(operandConstraint, defaultMemorySpace); bool tiled = !bitEnumContainsAny(operandConstraint, OperandConstraint::Scalar); return createToLayoutOp(rewriter, loc, input, desiredMemorySpace, tiled); @@ -557,8 +577,10 @@ createToLayoutOp(PatternRewriter &rewriter, Location loc, Value input, class TTIRLayoutDPSOperandsRewriter : public OpInterfaceRewritePattern { public: - using OpInterfaceRewritePattern< - DestinationStyleOpInterface>::OpInterfaceRewritePattern; + TTIRLayoutDPSOperandsRewriter(MLIRContext *ctx, + MemorySpace defaultMemorySpace) + : OpInterfaceRewritePattern(ctx), + defaultMemorySpace(defaultMemorySpace) {} LogicalResult matchAndRewrite(DestinationStyleOpInterface op, PatternRewriter &rewriter) const final { @@ -582,8 +604,9 @@ class TTIRLayoutDPSOperandsRewriter mlir::cast(op.getOperation()) .getOperandConstraints()[operand.getOperandNumber()]) .getValue(); - auto desiredLayout = createToLayoutOp(rewriter, op.getLoc(), - operand.get(), operandConstraint); + auto desiredLayout = + createToLayoutOp(rewriter, op.getLoc(), operand.get(), + operandConstraint, defaultMemorySpace); if (desiredLayout) { rewriter.modifyOpInPlace(op, [&]() { @@ -599,6 +622,9 @@ class TTIRLayoutDPSOperandsRewriter return modified ? success() : failure(); } + +private: + MemorySpace defaultMemorySpace; }; class TTIRLayoutFuncReturnRewriter @@ -650,7 +676,8 @@ class TTIRLayout : public impl::TTIRLayoutBase { } { RewritePatternSet patterns(&getContext()); - patterns.add(&getContext()); + patterns.add(&getContext(), + defaultMemorySpace); patterns.add(&getContext(), initMemorySpace); FrozenRewritePatternSet patternSet(std::move(patterns)); diff --git a/lib/Dialect/TTMetal/Transforms/Passes.cpp b/lib/Dialect/TTMetal/Transforms/Passes.cpp index 4146db7c20..76f7763a8c 100644 --- a/lib/Dialect/TTMetal/Transforms/Passes.cpp +++ b/lib/Dialect/TTMetal/Transforms/Passes.cpp @@ -840,6 +840,7 @@ void createTTIRToTTMetalBackendPipeline(OpPassManager &pm) { pm.addPass(mlir::tt::ttir::createTTIRGenericRegion()); mlir::tt::ttir::TTIRLayoutOptions layoutOptions; layoutOptions.initMemorySpace = mlir::tt::MemorySpace::DeviceL1; + layoutOptions.defaultMemorySpace = mlir::tt::MemorySpace::DeviceL1; pm.addPass(mlir::tt::ttir::createTTIRLayout(layoutOptions)); pm.addPass(mlir::tt::ttir::createTTIRGenericRegionOperandsToMemref()); pm.addPass(mlir::tt::ttir::createTTIRAllocate()); diff --git a/lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp b/lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp index cb78dfd682..56c05e8fe9 100644 --- a/lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp +++ b/lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp @@ -23,6 +23,7 @@ void createTTIRToTTNNBackendPipeline( pm.addPass(mlir::tt::ttir::createTTIRImplicitDevice()); mlir::tt::ttir::TTIRLayoutOptions layoutOptions; layoutOptions.initMemorySpace = mlir::tt::MemorySpace::System; + layoutOptions.defaultMemorySpace = mlir::tt::MemorySpace::DeviceDRAM; pm.addPass(mlir::tt::ttir::createTTIRLayout(layoutOptions)); if (options.gridSetPassEnabled) { diff --git a/test/ttmlir/Dialect/TTIR/test_grid_set.mlir b/test/ttmlir/Dialect/TTIR/test_grid_set.mlir index bf6eae61e9..0860ff4dab 100644 --- a/test/ttmlir/Dialect/TTIR/test_grid_set.mlir +++ b/test/ttmlir/Dialect/TTIR/test_grid_set.mlir @@ -3,7 +3,7 @@ module attributes {} { func.func @forward(%arg0: tensor<64x128xf32>, 
%arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { %0 = tensor.empty() : tensor<64x128xf32> - // CHECK: #[[LAYOUT_1:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <8x8>, memref<8x16xf32, #l1_>> + // CHECK: #[[LAYOUT_1:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <8x8>, memref<8x16xf32, #dram>> // CHECK: %[[C:.*]] = "ttir.multiply"[[C:.*]] -> tensor<64x128xf32, #[[LAYOUT_1]]> %1 = "ttir.multiply"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> return %1 : tensor<64x128xf32> diff --git a/test/ttmlir/Dialect/TTNN/multiple_add_with_loc.mlir b/test/ttmlir/Dialect/TTNN/multiple_add_with_loc.mlir index 5ba74e6f68..a8616f152b 100644 --- a/test/ttmlir/Dialect/TTNN/multiple_add_with_loc.mlir +++ b/test/ttmlir/Dialect/TTNN/multiple_add_with_loc.mlir @@ -3,7 +3,7 @@ #loc = loc("test_ops.py:17_0_0":0:0) module @pybuda_graph attributes {} { func.func @main(%arg0: tensor<1x32x32xf32> loc("test_ops.py:17_0_0":0:0), %arg1: tensor<1x32x32xf32> loc("test_ops.py:17_0_0":0:0), %arg2: tensor<1x32x32xf32> loc("test_ops.py:17_0_0":0:0)) -> (tensor<1x32x32xf32>, tensor<1x32x32xf32>) { - // CHECK: #[[LAYOUT_1:.*]] = #tt.layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), undef, <8x8>, memref<4x4xf32, #l1_>> + // CHECK: #[[LAYOUT_1:.*]] = #tt.layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), undef, <8x8>, memref<4x4xf32, #dram>> %0 = tensor.empty() : tensor<1x32x32xf32> loc(#loc5) // CHECK: %[[C:.*]] = "ttnn.add"[[C:.*]] -> tensor<1x32x32xf32, #[[LAYOUT_1]]> %1 = "ttir.add"(%arg1, %arg2, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<1x32x32xf32>, tensor<1x32x32xf32>, tensor<1x32x32xf32>) -> tensor<1x32x32xf32> loc(#loc5) diff --git a/test/ttmlir/Dialect/TTNN/multiple_add_with_loc_grid_override.mlir b/test/ttmlir/Dialect/TTNN/multiple_add_with_loc_grid_override.mlir index ae356c4811..adf62660bc 100644 --- a/test/ttmlir/Dialect/TTNN/multiple_add_with_loc_grid_override.mlir +++ b/test/ttmlir/Dialect/TTNN/multiple_add_with_loc_grid_override.mlir @@ -4,8 +4,8 @@ module @pybuda_graph attributes {} { func.func @main(%arg0: tensor<1x32x32xf32> loc("test_ops.py:17_0_0":0:0), %arg1: tensor<1x32x32xf32> loc("test_ops.py:17_0_0":0:0), %arg2: tensor<1x32x32xf32> loc("test_ops.py:17_0_0":0:0)) -> (tensor<1x32x32xf32>, tensor<1x32x32xf32>) { // CHECK: #[[LAYOUT_0:.*]] = #tt.layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), undef, <8x8>, memref<4x4xf32, #system>> - // CHECK: #[[LAYOUT_1:.*]] = #tt.layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), undef, <4x4>, memref<8x8xf32, #l1_>> - // CHECK: #[[LAYOUT_2:.*]] = #tt.layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), undef, <8x8>, memref<4x4xf32, #l1_>> + // CHECK: #[[LAYOUT_1:.*]] = #tt.layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), undef, <4x4>, memref<8x8xf32, #dram>> + // CHECK: #[[LAYOUT_2:.*]] = #tt.layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), undef, <8x8>, memref<4x4xf32, #dram>> %0 = tensor.empty() : tensor<1x32x32xf32> loc(#loc5) // CHECK: %[[C:.*]] = "ttnn.add"[[C:.*]] -> tensor<1x32x32xf32, #[[LAYOUT_1]]> %1 = "ttir.add"(%arg1, %arg2, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<1x32x32xf32>, tensor<1x32x32xf32>, tensor<1x32x32xf32>) -> tensor<1x32x32xf32> loc(#loc5) diff --git a/test/ttmlir/Dialect/TTNN/simple_matmul.mlir b/test/ttmlir/Dialect/TTNN/simple_matmul.mlir index 992b0c21db..f8ee937e74 100644 --- 
a/test/ttmlir/Dialect/TTNN/simple_matmul.mlir +++ b/test/ttmlir/Dialect/TTNN/simple_matmul.mlir @@ -1,6 +1,6 @@ // RUN: ttmlir-opt --ttir-load-system-desc --ttir-implicit-device --ttir-layout --ttnn-open-device --convert-ttir-to-ttnn %s | FileCheck %s #any_device_tile = #tt.operand_constraint -// CHECK: #[[TILED_LAYOUT:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #l1_>> +// CHECK: #[[TILED_LAYOUT:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #dram>> module attributes {} { func.func @forward(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x96xbf16>) -> tensor<64x96xbf16> { %0 = tensor.empty() : tensor<64x96xbf16> diff --git a/test/ttmlir/Dialect/TTNN/ttir_to_ttnn_pipeline.mlir b/test/ttmlir/Dialect/TTNN/ttir_to_ttnn_pipeline.mlir index 00c67542ad..cfdfde2d14 100644 --- a/test/ttmlir/Dialect/TTNN/ttir_to_ttnn_pipeline.mlir +++ b/test/ttmlir/Dialect/TTNN/ttir_to_ttnn_pipeline.mlir @@ -2,7 +2,7 @@ #any_device = #tt.operand_constraint module attributes {} { func.func @forward(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { - // CHECK: #[[LAYOUT_1:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <8x8>, memref<8x16xf32, #l1_>> + // CHECK: #[[LAYOUT_1:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <8x8>, memref<8x16xf32, #dram>> // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] %0 = tensor.empty() : tensor<64x128xf32> diff --git a/test/ttmlir/Dialect/TTNN/ttir_to_ttnn_pipeline_custom_opt.mlir b/test/ttmlir/Dialect/TTNN/ttir_to_ttnn_pipeline_custom_opt.mlir index 7b1d1ee475..e1acc7c802 100644 --- a/test/ttmlir/Dialect/TTNN/ttir_to_ttnn_pipeline_custom_opt.mlir +++ b/test/ttmlir/Dialect/TTNN/ttir_to_ttnn_pipeline_custom_opt.mlir @@ -2,7 +2,7 @@ #any_device = #tt.operand_constraint module attributes {} { func.func @forward(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { - // CHECK: #[[LAYOUT_1:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<64x128xf32, #l1_>> + // CHECK: #[[LAYOUT_1:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<64x128xf32, #dram>> // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] %0 = tensor.empty() : tensor<64x128xf32> From ab9a8b29dc8276554a07d837ddf9eccbcda3b7cf Mon Sep 17 00:00:00 2001 From: Nick Smith <127986401+nsmithtt@users.noreply.github.com> Date: Sat, 31 Aug 2024 11:13:33 -0700 Subject: [PATCH 07/16] Build fix (#567) --- test/ttmlir/Silicon/TTNN/simple_matmul.mlir | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/ttmlir/Silicon/TTNN/simple_matmul.mlir b/test/ttmlir/Silicon/TTNN/simple_matmul.mlir index a90e7817b5..fdee7305f9 100644 --- a/test/ttmlir/Silicon/TTNN/simple_matmul.mlir +++ b/test/ttmlir/Silicon/TTNN/simple_matmul.mlir @@ -2,7 +2,7 @@ // RUN: FileCheck %s --input-file=%t.mlir // RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn #any_device_tile = #tt.operand_constraint -// CHECK: #[[TILED_LAYOUT:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #l1_>> +// CHECK: #[[TILED_LAYOUT:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #dram>> module attributes {} { func.func @forward(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x96xbf16>) -> tensor<64x96xbf16> { %0 = tensor.empty() : tensor<64x96xbf16> From fd466fc84a250a348c61f96d834ab3885aa2a958 Mon Sep 17 00:00:00 2001 From: Kyle Mabee 
<118925087+kmabeeTT@users.noreply.github.com> Date: Sat, 31 Aug 2024 18:47:12 -0400 Subject: [PATCH 08/16] Various runtime::ttmetal CQExecutor buffer map improvements / segfault workaround / TTMetal Tests (#529) * A pair of runtime::ttmetal CQExecutor buffer map improvements (#408) - Prevent duplicate Buffers from being created inside CreateBufferCommand handler by checking for existence in buffers umap. - Change to use buffers.erase() in DeallocateBufferCommand to actually remove the entry from buffers umap. Buffer will still be destroyed because it goes out of scope. - Neither of these help with the original segfault in this ticket but these were found through visual observation. * Temporary Workaround for tt-metal Segfaults during teardown (#408) - A hack, in createBufferFromTensorRef(), remove when proper bug fix is made in tt-metal and propagates here. * Remove UNSUPPORTED: true flag from tests now that CI mixing tests issue resolved - Update test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir with CHECK to avoid errors and add missing flag --ttmetal-serialize-to-binary to run ttm binary in CI --- runtime/include/tt/runtime/detail/ttmetal.h | 13 ++++++++++ runtime/lib/ttmetal/command_queue.cpp | 8 +++--- .../Silicon/TTMetal/simple_eltwise.mlir | 1 - .../ttmlir/Silicon/TTMetal/tiled_reblock.mlir | 25 +++++++++++++++++-- test/ttmlir/Silicon/TTMetal/to_layout.mlir | 1 - 5 files changed, 41 insertions(+), 7 deletions(-) diff --git a/runtime/include/tt/runtime/detail/ttmetal.h b/runtime/include/tt/runtime/detail/ttmetal.h index 964caa6a5e..b79bde0e14 100644 --- a/runtime/include/tt/runtime/detail/ttmetal.h +++ b/runtime/include/tt/runtime/detail/ttmetal.h @@ -161,6 +161,19 @@ createBufferFromTensorRef(::tt::tt_metal::Device *device, std::shared_ptr<::tt::tt_metal::Buffer> buffer = ::tt::tt_metal::CreateBuffer(shardedBufferConfig); assert(tensorRef->address()); + + // Issue #408: Temporary Hack, remove when fix available. + // Update tt-metal BUFFER_MAP with updated address and remove + // entry for original alloc'd address. 
+ auto &buffer_map = tt::tt_metal::detail::BUFFER_MAP; + auto map_copy = buffer_map.value(); + auto old_key = std::make_tuple(device->id(), buffer->address()); + if (auto it = map_copy.find(old_key); it != map_copy.end()) { + auto new_key = std::make_tuple(device->id(), tensorRef->address()); + buffer_map.insert(new_key, it->second); + buffer_map.erase(old_key); + } + buffer->set_address(tensorRef->address()); return buffer; } diff --git a/runtime/lib/ttmetal/command_queue.cpp b/runtime/lib/ttmetal/command_queue.cpp index 5899896530..d93e012c74 100644 --- a/runtime/lib/ttmetal/command_queue.cpp +++ b/runtime/lib/ttmetal/command_queue.cpp @@ -242,8 +242,10 @@ void CQExecutor::execute( void CQExecutor::execute( ::tt::target::metal::CreateBufferCommand const *command) { - buffers[command->ref()->global_id()] = - createBufferFromTensorRef(device, command->ref()); + if (buffers.find(command->ref()->global_id()) == buffers.end()) { + buffers[command->ref()->global_id()] = + createBufferFromTensorRef(device, command->ref()); + } } void CQExecutor::execute( @@ -252,7 +254,7 @@ void CQExecutor::execute( assert(iter != buffers.end() && "Buffer not allocated"); assert(iter->second != nullptr && "Buffer already deallocated"); ::tt::tt_metal::DeallocateBuffer(*iter->second); - iter->second.reset(); + buffers.erase(iter); } void CQExecutor::execute( diff --git a/test/ttmlir/Silicon/TTMetal/simple_eltwise.mlir b/test/ttmlir/Silicon/TTMetal/simple_eltwise.mlir index 494e3f19a6..fdd65864df 100644 --- a/test/ttmlir/Silicon/TTMetal/simple_eltwise.mlir +++ b/test/ttmlir/Silicon/TTMetal/simple_eltwise.mlir @@ -1,5 +1,4 @@ // RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-to-ttmetal-backend-pipeline --ttmetal-serialize-to-binary="output=%t.ttm" %s | FileCheck %s -// UNSUPPORTED: true #any_device = #tt.operand_constraint func.func @multiply(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { diff --git a/test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir b/test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir index 1664ceba3e..1cebfe4515 100644 --- a/test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir +++ b/test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir @@ -1,5 +1,4 @@ -// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-allocate --convert-ttir-to-ttmetal %s | FileCheck %s -// UNSUPPORTED: true +// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-allocate --convert-ttir-to-ttmetal --ttmetal-serialize-to-binary="output=%t.ttm" %s | FileCheck %s #l1_ = #tt.memory_space #untilized = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<64x128xf32, #l1_>> @@ -7,11 +6,17 @@ #tilized2x2 = #tt.layout<(d0, d1) -> (d0, d1), undef, <2x2>, memref<1x2x!tt.tile<32 x 32, f32>, #l1_>> #untilized2x2 = #tt.layout<(d0, d1) -> (d0, d1), undef, <2x2>, memref<32x64xf32, #l1_>> func.func @tilize_reblock_2D(%arg0: tensor<64x128xf32, #untilized>) -> tensor<64x128xf32, #untilized2x2> { + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %0 = tensor.empty() : tensor<64x128xf32, #tilized> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %1 = "ttir.to_layout"(%arg0, %0) : (tensor<64x128xf32, #untilized>, tensor<64x128xf32, #tilized>) -> tensor<64x128xf32, #tilized> + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %2 = tensor.empty() : tensor<64x128xf32, #tilized2x2> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %3 = "ttir.to_layout"(%1, %2) : (tensor<64x128xf32, #tilized>, tensor<64x128xf32, #tilized2x2>) -> 
tensor<64x128xf32, #tilized2x2> + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %4 = tensor.empty() : tensor<64x128xf32, #untilized2x2> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %5 = "ttir.to_layout"(%3, %4) : (tensor<64x128xf32, #tilized2x2>, tensor<64x128xf32, #untilized2x2>) -> tensor<64x128xf32, #untilized2x2> return %5 : tensor<64x128xf32, #untilized2x2> } @@ -22,13 +27,19 @@ func.func @tilize_reblock_2D(%arg0: tensor<64x128xf32, #untilized>) -> tensor<64 #tilized4D_2x2 = #tt.layout<(d0, d1, d2, d3) -> (d0 * 192 + d1 * 64 + d2, d3), undef, <2x2>, memref<6x2x!tt.tile<32 x 32, f32>, #l1_>> #untilized4D_2x2 = #tt.layout<(d0, d1, d2, d3) -> (d0 * 192 + d1 * 64 + d2, d3), undef, <2x2>, memref<192x64xf32, #l1_>> func.func @tilize_reblock_4D(%arg0: tensor<2x3x64x128xf32, #untilized4D>) -> tensor<2x3x64x128xf32, #untilized4D_2x2> { + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %0 = tensor.empty() : tensor<2x3x64x128xf32, #tilized4D> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %1 = "ttir.to_layout"(%arg0, %0) : (tensor<2x3x64x128xf32, #untilized4D>, tensor<2x3x64x128xf32, #tilized4D>) -> tensor<2x3x64x128xf32, #tilized4D> + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %2 = tensor.empty() : tensor<2x3x64x128xf32, #tilized4D_2x2> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %3 = "ttir.to_layout"(%1, %2) : (tensor<2x3x64x128xf32, #tilized4D>, tensor<2x3x64x128xf32, #tilized4D_2x2>) -> tensor<2x3x64x128xf32, #tilized4D_2x2> + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %4 = tensor.empty() : tensor<2x3x64x128xf32, #untilized4D_2x2> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %5 = "ttir.to_layout"(%3, %4) : (tensor<2x3x64x128xf32, #tilized4D_2x2>, tensor<2x3x64x128xf32, #untilized4D_2x2>) -> tensor<2x3x64x128xf32, #untilized4D_2x2> return %5 : tensor<2x3x64x128xf32, #untilized4D_2x2> @@ -40,23 +51,33 @@ func.func @tilize_reblock_4D(%arg0: tensor<2x3x64x128xf32, #untilized4D>) -> ten #tilized_big_3x6 = #tt.layout<(d0, d1) -> (d0, d1), undef, <3x6>, memref<1x1x!tt.tile<32 x 32, f32>, #l1_>> func.func @tilize_reblock_big(%arg0: tensor<96x192xf32, #untilized_big>) -> tensor<96x192xf32, #untilized_big> { // move to tilized 1x1 + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %0 = tensor.empty() : tensor<96x192xf32, #tilized_big> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %1 = "ttir.to_layout"(%arg0, %0) : (tensor<96x192xf32, #untilized_big>, tensor<96x192xf32, #tilized_big>) -> tensor<96x192xf32, #tilized_big> // move to tilized 2x3 + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %2 = tensor.empty() : tensor<96x192xf32, #tilized_big_3x2> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %3 = "ttir.to_layout"(%1, %2) : (tensor<96x192xf32, #tilized_big>, tensor<96x192xf32, #tilized_big_3x2>) -> tensor<96x192xf32, #tilized_big_3x2> // move to tilized 3x3 + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %4 = tensor.empty() : tensor<96x192xf32, #tilized_big_3x6> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %5 = "ttir.to_layout"(%3, %4) : (tensor<96x192xf32, #tilized_big_3x2>, tensor<96x192xf32, #tilized_big_3x6>) -> tensor<96x192xf32, #tilized_big_3x6> // move back to tilized 1x1 + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %6 = tensor.empty() : tensor<96x192xf32, #tilized_big> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %7 = "ttir.to_layout"(%5, %6) : (tensor<96x192xf32, #tilized_big_3x6>, tensor<96x192xf32, #tilized_big>) -> tensor<96x192xf32, #tilized_big> // untilize + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %8 = tensor.empty() : 
tensor<96x192xf32, #untilized_big> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %9 = "ttir.to_layout"(%7, %8) : (tensor<96x192xf32, #tilized_big>, tensor<96x192xf32, #untilized_big>) -> tensor<96x192xf32, #untilized_big> return %9 : tensor<96x192xf32, #untilized_big> diff --git a/test/ttmlir/Silicon/TTMetal/to_layout.mlir b/test/ttmlir/Silicon/TTMetal/to_layout.mlir index 6b361a76da..f268e7b397 100644 --- a/test/ttmlir/Silicon/TTMetal/to_layout.mlir +++ b/test/ttmlir/Silicon/TTMetal/to_layout.mlir @@ -1,5 +1,4 @@ // RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-allocate --convert-ttir-to-ttmetal --ttmetal-serialize-to-binary="output=%t.ttm" %s | FileCheck %s -// UNSUPPORTED: true #l1_ = #tt.memory_space #layout = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<4x16xf32, #l1_>> From 85c081f05479c421c876e2a21ec186f317e61049 Mon Sep 17 00:00:00 2001 From: Nick Smith <127986401+nsmithtt@users.noreply.github.com> Date: Mon, 2 Sep 2024 18:18:22 -0700 Subject: [PATCH 09/16] Add system desc attribute dram_unreserved_end (#540) Calculate the end of the DRAM region that is not usable by compiler. This upper region of memory is where kernel programs get allocated to. This calculation intends to estimate some conservative max number, but still needs a mechanism to enforce during runtime #539. --- include/ttmlir/Dialect/TT/IR/TTOpsTypes.td | 4 ++- include/ttmlir/Target/Common/types.fbs | 1 + .../ttmlir/Target/Utils/MLIRToFlatbuffer.h | 1 + lib/CAPI/TTAttrs.cpp | 6 ++-- lib/Dialect/TT/IR/TTOpsTypes.cpp | 6 ++-- python/TTModule.cpp | 6 ++-- runtime/lib/common/system_desc.cpp | 31 +++++++++++++++++-- 7 files changed, 43 insertions(+), 12 deletions(-) diff --git a/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td b/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td index 6f55d29f62..35c6110767 100644 --- a/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td +++ b/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td @@ -107,6 +107,7 @@ def TT_ChipDescAttr : TT_Attr<"ChipDesc", "chip_desc"> { "unsigned":$l1UnreservedBase, "unsigned":$eriscL1UnreservedBase, "unsigned":$dramUnreservedBase, + "unsigned":$dramUnreservedEnd, "ChipPhysicalCoresAttr":$chipPhysicalCores, ArrayRefParameter<"DataTypeAttr">:$supportedDataTypes, ArrayRefParameter<"TileSizeAttr">:$supportedTileSizes); @@ -121,13 +122,14 @@ def TT_ChipDescAttr : TT_Attr<"ChipDesc", "chip_desc"> { `l1_unreserved_base` `=` $l1UnreservedBase `,` `erisc_l1_unreserved_base` `=` $eriscL1UnreservedBase `,` `dram_unreserved_base` `=` $dramUnreservedBase `,` + `dram_unreserved_end` `=` $dramUnreservedEnd `,` `physical_cores` `=` $chipPhysicalCores `,` `supported_data_types` `=` `[` $supportedDataTypes `]` `,` `supported_tile_sizes` `=` `[` $supportedTileSizes `]` `}`}]; let extraClassDeclaration = [{ unsigned getUsableL1Size() const { return getL1Size() - getL1UnreservedBase(); } - unsigned getUsableDramChannelSize() const { return getDramChannelSize() - getDramUnreservedBase(); } + unsigned getUsableDramChannelSize() const { return getDramUnreservedEnd() - getDramUnreservedBase(); } }]; } diff --git a/include/ttmlir/Target/Common/types.fbs b/include/ttmlir/Target/Common/types.fbs index 42a8287611..f3d588c316 100644 --- a/include/ttmlir/Target/Common/types.fbs +++ b/include/ttmlir/Target/Common/types.fbs @@ -105,6 +105,7 @@ table ChipDesc { l1_unreserved_base: uint32; erisc_l1_unreserved_base: uint32; dram_unreserved_base: uint32; + dram_unreserved_end: uint32; physical_cores: ChipPhysicalCores; supported_data_types: [DataType]; 
supported_tile_sizes: [Dim2d]; diff --git a/include/ttmlir/Target/Utils/MLIRToFlatbuffer.h b/include/ttmlir/Target/Utils/MLIRToFlatbuffer.h index fa8e67466e..b56834d25a 100644 --- a/include/ttmlir/Target/Utils/MLIRToFlatbuffer.h +++ b/include/ttmlir/Target/Utils/MLIRToFlatbuffer.h @@ -244,6 +244,7 @@ toFlatbuffer(FlatbufferObjectCache &cache, ChipDescAttr chipDesc) { chipDesc.getPcieAddressAlignBytes(), chipDesc.getNocDRAMAddressAlignBytes(), chipDesc.getL1UnreservedBase(), chipDesc.getEriscL1UnreservedBase(), chipDesc.getDramUnreservedBase(), + chipDesc.getDramUnreservedEnd(), toFlatbuffer(cache, chipDesc.getChipPhysicalCores()), toFlatbuffer(cache, chipDesc.getSupportedDataTypes()), toFlatbuffer(cache, chipDesc.getSupportedTileSizes())); diff --git a/lib/CAPI/TTAttrs.cpp b/lib/CAPI/TTAttrs.cpp index 07db90b516..e3bee6e056 100644 --- a/lib/CAPI/TTAttrs.cpp +++ b/lib/CAPI/TTAttrs.cpp @@ -38,14 +38,14 @@ MlirAttribute ttmlirTTChipDescAttrGet( unsigned nocL1AddressAlignBytes, unsigned pcieAddressAlignBytes, unsigned nocDRAMAddressAlignBytes, unsigned l1UnreservedBase, unsigned eriscL1UnreservedBase, unsigned dramUnreservedBase, - MlirAttribute chipPhysicalCores, MlirAttribute *supportedDataTypes, - MlirAttribute *supportedTileSizes) { + unsigned dramUnreservedEnd, MlirAttribute chipPhysicalCores, + MlirAttribute *supportedDataTypes, MlirAttribute *supportedTileSizes) { std::vector gridVec(grid, grid + gridSize); return wrap(ChipDescAttr::get( unwrap(ctx), mlir::dyn_cast(unwrap(arch)), gridVec, l1Size, numDramChannels, dramChannelSize, nocL1AddressAlignBytes, pcieAddressAlignBytes, nocDRAMAddressAlignBytes, l1UnreservedBase, - eriscL1UnreservedBase, dramUnreservedBase, + eriscL1UnreservedBase, dramUnreservedBase, dramUnreservedEnd, mlir::dyn_cast(unwrap(chipPhysicalCores)), mlir::dyn_cast(unwrap(*supportedDataTypes)), mlir::dyn_cast(unwrap(*supportedTileSizes)))); diff --git a/lib/Dialect/TT/IR/TTOpsTypes.cpp b/lib/Dialect/TT/IR/TTOpsTypes.cpp index d760b012eb..35f48177c6 100644 --- a/lib/Dialect/TT/IR/TTOpsTypes.cpp +++ b/lib/Dialect/TT/IR/TTOpsTypes.cpp @@ -84,7 +84,7 @@ mlir::tt::SystemDescAttr::getDefault(MLIRContext *context) { { tt::ChipDescAttr::get( context, tt::ArchAttr::get(context, tt::Arch::WormholeB0), - gridShape, 1499136, 12, (1 << 30), 16, 32, 32, 0, 0, 0, + gridShape, 1499136, 12, (1 << 30), 16, 32, 32, 0, 0, 0, (1 << 30), tt::ChipPhysicalCoresAttr::get(context, workerCores, dramCores, {}, {}), supported_data_types, supported_tile_sizes), @@ -242,8 +242,8 @@ mlir::tt::SystemDescAttr::getFromPath(MLIRContext *context, std::string &path) { element->pcie_address_align_bytes(), element->noc_dram_address_align_bytes(), element->l1_unreserved_base(), element->erisc_l1_unreserved_base(), element->dram_unreserved_base(), - chip_physical_cores_attr, supported_data_types_attr, - supported_tile_sizes_attr); + element->dram_unreserved_end(), chip_physical_cores_attr, + supported_data_types_attr, supported_tile_sizes_attr); chip_desc_list.push_back(current_chip_desc_attr); } diff --git a/python/TTModule.cpp b/python/TTModule.cpp index 8ebf4c9a74..1e3841deac 100644 --- a/python/TTModule.cpp +++ b/python/TTModule.cpp @@ -124,15 +124,15 @@ void populateTTModule(py::module &m) { unsigned dramChannelSize, unsigned nocL1AddressAlignBytes, unsigned pcieAddressAlignBytes, unsigned nocDRAMAddressAlignBytes, unsigned l1UnreservedBase, unsigned eriscL1UnreservedBase, - unsigned dramUnreservedBase, MlirAttribute chipPhysicalCores, - MlirAttribute supportedDataTypes, + unsigned dramUnreservedBase, 
unsigned dramUnreservedEnd, + MlirAttribute chipPhysicalCores, MlirAttribute supportedDataTypes, MlirAttribute supportedTileSizes) { return wrap(tt::ChipDescAttr::get( unwrap(ctx), mlir::cast(unwrap(arch)), grid, l1Size, numDramChannels, dramChannelSize, nocL1AddressAlignBytes, pcieAddressAlignBytes, nocDRAMAddressAlignBytes, l1UnreservedBase, - eriscL1UnreservedBase, dramUnreservedBase, + eriscL1UnreservedBase, dramUnreservedBase, dramUnreservedEnd, mlir::dyn_cast( unwrap(chipPhysicalCores)), mlir::cast(unwrap(supportedDataTypes)), diff --git a/runtime/lib/common/system_desc.cpp b/runtime/lib/common/system_desc.cpp index 2e84fe8ef2..091b193397 100644 --- a/runtime/lib/common/system_desc.cpp +++ b/runtime/lib/common/system_desc.cpp @@ -148,6 +148,31 @@ createChipPhysicalCores(const ::tt::tt_metal::Device *device, fbb.CreateVectorOfStructs(eth_inactive_cores)); } +// Calculate the end of the DRAM region that is not usable by compiler. This +// upper region of memory is where kernel programs get allocated to. This +// function intends to estimate some conservative max number. +static std::uint32_t +calculateDRAMUnreservedEnd(const ::tt::tt_metal::Device *device) { + CoreCoord deviceGridSize = device->logical_grid_size(); + CoreCoord dramGridSize = device->dram_grid_size(); + std::uint32_t totalCores = deviceGridSize.x * deviceGridSize.y + + device->get_active_ethernet_cores().size(); + std::uint32_t totalDramCores = dramGridSize.x * dramGridSize.y; + std::uint32_t programCarveOutPerCore = L1_UNRESERVED_BASE; + std::uint32_t totalProgramCarveOut = programCarveOutPerCore * totalCores; + // The total carve out can be interleaved between all dram channels + std::uint32_t programCarveOutDramSpace = + (totalProgramCarveOut + totalDramCores - 1) / totalDramCores; + static_assert(DRAM_ALIGNMENT > 0); + static_assert((DRAM_ALIGNMENT & (DRAM_ALIGNMENT - 1)) == 0); + assert(programCarveOutDramSpace < device->dram_size_per_channel()); + std::uint32_t dramUnreservedEnd = + device->dram_size_per_channel() - programCarveOutDramSpace; + // Align to DRAM_ALIGNMENT + dramUnreservedEnd = dramUnreservedEnd & ~(DRAM_ALIGNMENT - 1); + return dramUnreservedEnd; +} + static std::unique_ptr<::tt::runtime::SystemDesc> getCurrentSystemDescImpl(const ::tt::tt_metal::DeviceMesh &deviceMesh) { std::vector<::tt::tt_metal::Device *> devices = deviceMesh.get_devices(); @@ -192,13 +217,15 @@ getCurrentSystemDescImpl(const ::tt::tt_metal::DeviceMesh &deviceMesh) { auto supportedTileSizes = fbb.CreateVectorOfStructs(supportedTileSizesVector); + auto dramUnreservedEnd = calculateDRAMUnreservedEnd(device); + chipDescs.push_back(::tt::target::CreateChipDesc( fbb, toFlatbuffer(device->arch()), &deviceGrid, device->l1_size_per_core(), device->num_dram_channels(), device->dram_size_per_channel(), L1_ALIGNMENT, PCIE_ALIGNMENT, DRAM_ALIGNMENT, L1_UNRESERVED_BASE, ERISC_L1_UNRESERVED_BASE, - DRAM_UNRESERVED_BASE, chipPhysicalCores, supportedDataTypes, - supportedTileSizes)); + DRAM_UNRESERVED_BASE, dramUnreservedEnd, chipPhysicalCores, + supportedDataTypes, supportedTileSizes)); chipDescIndices.push_back(device->id()); // Derive chip capability ::tt::target::ChipCapability chipCapability = From 5d60c17023ca71ba8b795b3b0a9647d332f4930b Mon Sep 17 00:00:00 2001 From: Radenko Pavlovic <133032400+rpavlovicTT@users.noreply.github.com> Date: Tue, 3 Sep 2024 12:57:48 +0200 Subject: [PATCH 10/16] Refactoring in TTMetal dialect (#578) This commit refactors: 1. Dialect conversion from TTKernel to EmitC. 2. 
Serialization of TTMetal IR to flatbuffer binary. 1. Implement dialect conversion from TTKernel to EmitC TTKernel dialect that can be found nested in TTMetal ops can now be converted via 'convert-ttkernel-to-emitc' pass. Pass is registered as a func::FuncOp pass so the kernel must be put inside a function before conversion. When serializing ttmetal IR to binary, we call this conversion for every region of a ttmetal dispatch op. FileCheck UT is added. 2. Translate TTMetal to flatbuffer Serialization to flatbuffer binary is now a proper translation pass that can be run with: ttmlir-translate --ttmetal-to-flatbuffer ttmetal.mlir --- include/ttmlir/Conversion/Passes.h | 1 + include/ttmlir/Conversion/Passes.td | 6 + .../TTKernelToEmitC/TTKernelToEmitC.h | 38 ++ .../Dialect/TTMetal/Transforms/KernelsToCpp.h | 18 - .../Dialect/TTMetal/Transforms/Passes.td | 11 - .../Target/TTMetal/TTMetalToFlatbuffer.h | 19 + include/ttmlir/Target/TTNN/TTNNToFlatbuffer.h | 4 +- lib/Conversion/CMakeLists.txt | 7 +- lib/Conversion/TTKernelToEmitC/CMakeLists.txt | 12 + .../TTKernelToEmitC/TTKernelToEmitC.cpp | 414 ++++++++++++++++++ lib/Conversion/TosaToTTIR/TosaToTTIR.cpp | 15 +- lib/Dialect/TTKernel/IR/TTKernelOps.cpp | 2 +- lib/Dialect/TTMetal/Transforms/CMakeLists.txt | 2 - .../TTMetal/Transforms/KernelsToCpp.cpp | 327 -------------- .../TTMetal/Transforms/SerializeToBinary.cpp | 295 ------------- lib/SharedLib/CMakeLists.txt | 1 + lib/Target/CMakeLists.txt | 1 + lib/Target/TTMetal/CMakeLists.txt | 17 + lib/Target/TTMetal/TTMetalToFlatbuffer.cpp | 289 ++++++++++++ .../TTMetalToFlatbufferRegistration.cpp | 29 ++ .../Conversion/TTKernelToEmitC/ttkernel.mlir | 50 +++ .../Silicon/TTMetal/simple_eltwise.mlir | 2 +- .../ttmlir/Silicon/TTMetal/tiled_reblock.mlir | 2 +- test/ttmlir/Silicon/TTMetal/to_layout.mlir | 2 +- tools/ttmlir-translate/CMakeLists.txt | 2 +- tools/ttmlir-translate/ttmlir-translate.cpp | 7 + 26 files changed, 903 insertions(+), 670 deletions(-) create mode 100644 include/ttmlir/Conversion/TTKernelToEmitC/TTKernelToEmitC.h delete mode 100644 include/ttmlir/Dialect/TTMetal/Transforms/KernelsToCpp.h create mode 100644 include/ttmlir/Target/TTMetal/TTMetalToFlatbuffer.h create mode 100644 lib/Conversion/TTKernelToEmitC/CMakeLists.txt create mode 100644 lib/Conversion/TTKernelToEmitC/TTKernelToEmitC.cpp delete mode 100644 lib/Dialect/TTMetal/Transforms/KernelsToCpp.cpp delete mode 100644 lib/Dialect/TTMetal/Transforms/SerializeToBinary.cpp create mode 100644 lib/Target/TTMetal/CMakeLists.txt create mode 100644 lib/Target/TTMetal/TTMetalToFlatbuffer.cpp create mode 100644 lib/Target/TTMetal/TTMetalToFlatbufferRegistration.cpp create mode 100644 test/ttmlir/Conversion/TTKernelToEmitC/ttkernel.mlir diff --git a/include/ttmlir/Conversion/Passes.h b/include/ttmlir/Conversion/Passes.h index b12e9ebb83..7750486686 100644 --- a/include/ttmlir/Conversion/Passes.h +++ b/include/ttmlir/Conversion/Passes.h @@ -9,6 +9,7 @@ #include "ttmlir/Conversion/StableHLOToTTIR/StableHLOToTTIR.h" #endif #include "ttmlir/Conversion/TTIRToTTNN/TTIRToTTNN.h" +#include "ttmlir/Conversion/TTKernelToEmitC/TTKernelToEmitC.h" #include "ttmlir/Conversion/TTNNToEmitC/TTNNToEmitC.h" #include "ttmlir/Conversion/TosaToTTIR/TosaToTTIR.h" #include "ttmlir/Dialect/TTIR/IR/TTIR.h" diff --git a/include/ttmlir/Conversion/Passes.td b/include/ttmlir/Conversion/Passes.td index 4925e61cc4..92926dbfdb 100644 --- a/include/ttmlir/Conversion/Passes.td +++ b/include/ttmlir/Conversion/Passes.td @@ -34,4 +34,10 @@ def ConvertTTNNToEmitC : 
Pass<"convert-ttnn-to-emitc", "::mlir::ModuleOp"> { let dependentDialects = ["mlir::emitc::EmitCDialect", "mlir::tt::ttnn::TTNNDialect"]; } +def ConvertTTKernelToEmitC : Pass<"convert-ttkernel-to-emitc", "::func::FuncOp"> { + let summary = "Convert TTKernel dialect to EmitC dialect."; + let dependentDialects = ["mlir::emitc::EmitCDialect", "mlir::func::FuncDialect", + "mlir::tt::ttkernel::TTKernelDialect"]; +} + #endif // TTMLIR_CONVERSION_PASSES diff --git a/include/ttmlir/Conversion/TTKernelToEmitC/TTKernelToEmitC.h b/include/ttmlir/Conversion/TTKernelToEmitC/TTKernelToEmitC.h new file mode 100644 index 0000000000..57592eb3ab --- /dev/null +++ b/include/ttmlir/Conversion/TTKernelToEmitC/TTKernelToEmitC.h @@ -0,0 +1,38 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef TTMLIR_CONVERSION_TTKERNELTOEMITC_TTKERNELTOEMITC_H +#define TTMLIR_CONVERSION_TTKERNELTOEMITC_TTKERNELTOEMITC_H + +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Pass/Pass.h" + +#include "ttmlir/Dialect/TTKernel/IR/TTKernelOpsTypes.h" +#include "ttmlir/Dialect/TTMetal/IR/TTMetalOps.h" +#include + +namespace mlir::tt { +#define GEN_PASS_DECL_CONVERTTTKERNELTOEMITC +#include "ttmlir/Conversion/Passes.h.inc" + +// Runs a conversion pass to EmitC dialect on a func op containing given +// region's body. Also, it adds boilerplate code such as includes and namespace +// declarations. +LogicalResult +convertTTKernelRegionToEmitC(OpBuilder &builder, Region *region, + const ttkernel::ThreadTypeAttr &threadType); + +// Converts given region to EmitC dialect and translates it to C++ code. +LogicalResult +emitDispatchOpRegionAsCpp(Region *region, std::string ®ionCpp, + const ttkernel::ThreadTypeAttr &threadType); + +// Converts dispatch op's regions to C++ code. +LogicalResult +emitDispatchOpRegionsAsCpp(ttmetal::DispatchOp dispatchOp, + llvm::SmallVector &cppStrings); + +} // namespace mlir::tt + +#endif diff --git a/include/ttmlir/Dialect/TTMetal/Transforms/KernelsToCpp.h b/include/ttmlir/Dialect/TTMetal/Transforms/KernelsToCpp.h deleted file mode 100644 index 18c1a9ef6e..0000000000 --- a/include/ttmlir/Dialect/TTMetal/Transforms/KernelsToCpp.h +++ /dev/null @@ -1,18 +0,0 @@ -// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 - -#ifndef TTMLIR_DIALECT_TTMETAL_TRANSFORMS_KERNELSTOCPP_H -#define TTMLIR_DIALECT_TTMETAL_TRANSFORMS_KERNELSTOCPP_H - -#include "mlir/Support/LogicalResult.h" - -#include "ttmlir/Dialect/TTKernel/IR/TTKernelOpsTypes.h" -#include "ttmlir/Dialect/TTMetal/IR/TTMetalOps.h" - -namespace mlir::tt::ttmetal { -LogicalResult emitDispatchOpRegionAsCpp(DispatchOp dispatchOp, - unsigned regionNumber, - llvm::raw_ostream &os); -} // namespace mlir::tt::ttmetal -#endif diff --git a/include/ttmlir/Dialect/TTMetal/Transforms/Passes.td b/include/ttmlir/Dialect/TTMetal/Transforms/Passes.td index ee6f024084..e321db93a3 100644 --- a/include/ttmlir/Dialect/TTMetal/Transforms/Passes.td +++ b/include/ttmlir/Dialect/TTMetal/Transforms/Passes.td @@ -14,15 +14,4 @@ def ConvertTTIRToTTMetal: Pass<"convert-ttir-to-ttmetal", "::mlir::ModuleOp"> { }]; } -def TTMetalSerializeToBinary: Pass<"ttmetal-serialize-to-binary", "::mlir::ModuleOp"> { - let summary = ""; - let description = [{ - todo - }]; - - list