Skip to content

Commit

Permalink
Address comments last round
Browse files Browse the repository at this point in the history
Internal CBs that are not associated with tensor operands have their
address assigned locally.
  • Loading branch information
rpavlovicTT committed Nov 15, 2024
1 parent 163d087 commit d0eeefc
Show file tree
Hide file tree
Showing 11 changed files with 71 additions and 98 deletions.
4 changes: 2 additions & 2 deletions include/ttmlir/Dialect/TT/IR/TTOpsTypes.td
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,8 @@ def TT_ChipDescAttr : TT_Attr<"ChipDesc", "chip_desc"> {

let extraClassDeclaration = [{
unsigned getUsableL1Size() const { return getL1Size() - getL1UnreservedBase(); }
unsigned getGlobalL1RegionAddress() const;
unsigned getGlobalL1RegionSize() const;
unsigned getScratchL1RegionAddress() const;
unsigned getScratchL1RegionSize() const;
unsigned getUsableDramChannelSize() const { return getDramUnreservedEnd() - getDramUnreservedBase(); }
}];
}
Expand Down
15 changes: 11 additions & 4 deletions include/ttmlir/Dialect/TTKernel/IR/TTKernelOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -536,13 +536,20 @@ def TTKernel_UnreachableOp : TTKernel_Op<"unreachable", [Pure, ReturnLike, Termi
}];
}

def TTKernel_MacroOp : TTKernel_Op<"macro"> {
let summary = "Macro op.";
def TTKernel_MemZerosBaseOp : TTKernel_Op<"mem_zeros_base"> {
let summary = "Op corresponding to MEM_ZEROS_BASE macro in kernels.";
let description = [{
Macro operation to mimic C++ macros
Op corresponding to MEM_ZEROS_BASE macro in kernels.
}];

let arguments = (ins FlatSymbolRefAttr:$name);
let results = (outs I32:$result);
}

def TTKernel_MemZerosSizeOp : TTKernel_Op<"mem_zeros_size"> {
let summary = "Op corresponding to MEM_ZEROS_SIZE macro in kernels.";
let description = [{
Op corresponding to MEM_ZEROS_SIZE macro in kernels.
}];

let results = (outs I32:$result);
}
Expand Down
10 changes: 4 additions & 6 deletions include/ttmlir/Dialect/TTKernel/IR/TTKernelOpsTypes.td
Original file line number Diff line number Diff line change
Expand Up @@ -81,24 +81,22 @@ def TTKernel_CB : TTKernel_Type<"CB", "cb"> {
"uint64_t":$address,
"MemRefType":$memref,
"uint64_t":$page_size,
"uint64_t":$num_buffers,
"bool":$is_internal);
let assemblyFormat = "`<` $port`,` $address`,` $memref`,` $page_size`,` $num_buffers`,` $is_internal `>`";
"uint64_t":$num_buffers);
let assemblyFormat = "`<` $port`,` $address`,` $memref`,` $page_size`,` $num_buffers `>`";

let extraClassDeclaration = [{
static CBType get(::mlir::MLIRContext *context,
CBPort port,
uint64_t address,
MemRefType memref,
bool is_internal = false) {
MemRefType memref) {
uint64_t numBuffers = 1;
uint64_t pageSize = 0;
if (::mlir::isa<::mlir::tt::TileType>(memref.getElementType())) {
pageSize = ::mlir::cast<::mlir::tt::TileType>(memref.getElementType()).getSizeBytes();
} else {
pageSize = memref.getShape().back() * (memref.getElementType().getIntOrFloatBitWidth() / 8);
}
return CBType::get(context, port, address, memref, pageSize, numBuffers, is_internal);
return CBType::get(context, port, address, memref, pageSize, numBuffers);
}

::llvm::ArrayRef<int64_t> getShape() const {
Expand Down
16 changes: 8 additions & 8 deletions lib/Conversion/TTIRToTTMetal/TTIRToTTMetal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -386,11 +386,11 @@ class TTIRToTTMetalLayoutRewriter : public OpRewritePattern<ttir::ToLayoutOp> {
Type inputCBTy = rewriter.getType<ttkernel::CBType>(
ttkernel::CBPort::In0, inputBaseAddress,
mlir::cast<MemRefType>(inputLayout.getMemref()), pageSize,
/*num_buffers*/ 1, false /* is_internal */);
/*num_buffers*/ 1);
Type outputCBTy = rewriter.getType<ttkernel::CBType>(
ttkernel::CBPort::Out0, outputBaseAddress,
mlir::cast<MemRefType>(outputLayout.getMemref()), pageSize,
/*num_buffers*/ 1, false /* is_internal */);
/*num_buffers*/ 1);
tensixBlock->addArgument(inputCBTy, op.getLoc());
tensixBlock->addArgument(outputCBTy, op.getLoc());

Expand Down Expand Up @@ -1411,8 +1411,8 @@ class TTIRToTTMetalDispatchRewriter : public OpRewritePattern<ttir::GenericOp> {

auto scalerCBType = ttkernel::CBType::get(
op.getContext(), scalerCBPort,
op.getSystemDesc().getChipDescs().front().getGlobalL1RegionAddress(),
tileMemref, true /* is_internal */);
op.getSystemDesc().getChipDescs().front().getScratchL1RegionAddress(),
tileMemref);
auto scalerCB = dmBlock->addArgument(scalerCBType, op.getLoc());

auto reduceKind = kernelOp.getKind();
Expand Down Expand Up @@ -1465,12 +1465,12 @@ class TTIRToTTMetalDispatchRewriter : public OpRewritePattern<ttir::GenericOp> {
builder.create<ttkernel::CBReserveBackOp>(loc, scalerCB, oneConst);

// Prepare zero region read.
auto zerosBase = builder.create<ttkernel::MacroOp>(
loc, builder.getI32Type(), "MEM_ZEROS_BASE");
auto zerosBase =
builder.create<ttkernel::MemZerosBaseOp>(loc, builder.getI32Type());
auto zerosNocAddr =
builder.create<ttkernel::GetNocAddrOp>(loc, zerosBase->getResult(0));
auto memZerosSize = builder.create<ttkernel::MacroOp>(
loc, builder.getI32Type(), "MEM_ZEROS_SIZE");
auto memZerosSize =
builder.create<ttkernel::MemZerosSizeOp>(loc, builder.getI32Type());
builder.create<ttkernel::NocAsyncReadOnePacketSetStateOp>(loc, zerosNocAddr,
memZerosSize);

Expand Down
20 changes: 14 additions & 6 deletions lib/Conversion/TTKernelToEmitC/TTKernelToEmitC.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "mlir/Target/Cpp/CppEmitter.h"
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/LogicalResult.h"
#include "llvm/Support/raw_ostream.h"
#include <string>
Expand Down Expand Up @@ -302,20 +303,26 @@ class TTMetalToEmitCOpaqueRewriter : public OpConversionPattern<SourceOp> {
std::string opName;
};

class TTKernelMacroOpToEmitCOpRewriter
: public OpConversionPattern<ttkernel::MacroOp> {
template <typename Op, typename Adaptor = typename Op::Adaptor>
class TTKernelMacroOpToEmitCOpRewriter : public OpConversionPattern<Op> {
public:
TTKernelMacroOpToEmitCOpRewriter(TTKernelToEmitCTypeConverter &typeConverter,
MLIRContext *ctx)
: OpConversionPattern<ttkernel::MacroOp>(typeConverter, ctx) {}
: OpConversionPattern<Op>(typeConverter, ctx) {}

std::string getMacroName(Op op) const {
auto name = op.getOperation()->getName().getStringRef();
name = name.drop_front(9);
return name.upper();
}

LogicalResult
matchAndRewrite(ttkernel::MacroOp op, ttkernel::MacroOp::Adaptor adaptor,
matchAndRewrite(Op op, Adaptor adaptor,
ConversionPatternRewriter &rewriter) const final {

rewriter.replaceOpWithNewOp<emitc::ConstantOp>(
op, op->getResultTypes(),
emitc::OpaqueAttr::get(op->getContext(), adaptor.getName()));
emitc::OpaqueAttr::get(op->getContext(), getMacroName(op)));

return success();
}
Expand Down Expand Up @@ -376,7 +383,8 @@ class ConvertTTKernelToEmitCPass

patterns
.add<TTMetalToEmitCFuncArgsRewriter, TTMetalToEmitCReturnRewriter,
TTKernelMacroOpToEmitCOpRewriter,
TTKernelMacroOpToEmitCOpRewriter<ttkernel::MemZerosBaseOp>,
TTKernelMacroOpToEmitCOpRewriter<ttkernel::MemZerosSizeOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::BuiltinOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::CopyTileInitOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::RecipTileInitOp>,
Expand Down
12 changes: 6 additions & 6 deletions lib/Dialect/TT/IR/TTOpsTypes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,14 @@ using namespace mlir::tt;
#define GET_TYPEDEF_CLASSES
#include "ttmlir/Dialect/TT/IR/TTOpsTypes.cpp.inc"

unsigned mlir::tt::ChipDescAttr::getGlobalL1RegionSize() const {
// 4KB is the default size for the global L1 region.
constexpr uint32_t kGlobalL1RegionSize = 1 << 12;
return kGlobalL1RegionSize;
unsigned mlir::tt::ChipDescAttr::getScratchL1RegionSize() const {
// 4KB is the default size for the scratch L1 region.
constexpr uint32_t kScratchL1RegionSize = 1 << 12;
return kScratchL1RegionSize;
}

unsigned mlir::tt::ChipDescAttr::getGlobalL1RegionAddress() const {
return getL1Size() - getGlobalL1RegionSize();
unsigned mlir::tt::ChipDescAttr::getScratchL1RegionAddress() const {
return getL1Size() - getScratchL1RegionSize();
}

mlir::tt::SystemDescAttr
Expand Down
2 changes: 1 addition & 1 deletion lib/Dialect/TTIR/Transforms/Allocate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ class TTIRAllocate : public impl::TTIRAllocateBase<TTIRAllocate> {
memorySpaceInfo[ttmlir::utils::enum_as_int(MemorySpace::DeviceL1)] =
SimpleAllocator::MemorySpaceInfo(chipDesc.getL1UnreservedBase(),
chipDesc.getL1Size() -
chipDesc.getGlobalL1RegionSize(),
chipDesc.getScratchL1RegionSize(),
chipDesc.getNocL1AddressAlignBytes());
memorySpaceInfo[ttmlir::utils::enum_as_int(MemorySpace::DeviceDRAM)] =
SimpleAllocator::MemorySpaceInfo(
Expand Down
3 changes: 2 additions & 1 deletion lib/Target/TTMetal/TTMetalToFlatbuffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,8 @@ static std::shared_ptr<void> translateModuleToFlatbuffer(
for (auto arg : region.getArguments()) {
auto cbType = mlir::cast<ttkernel::CBType>(arg.getType());
auto cbDesc = cache.getOrCreate(cbType, cbTypeToFlatbuffer);
auto tensorRef = cbType.getIsInternal() ? 0 : operands[argNumber++];
auto tensorRef =
argNumber >= operands.size() ? 0 : operands[argNumber++];
cbs.push_back(
::tt::target::CreateCBRef(fbb, cache.global_id++, tensorRef,
cbType.getAddress(), cbDesc));
Expand Down
1 change: 1 addition & 0 deletions runtime/include/tt/runtime/detail/ttmetal.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
#pragma clang diagnostic ignored "-Wzero-length-array"
#define FMT_HEADER_ONLY
#include "distributed/mesh_device.hpp"
#include "impl/buffers/circular_buffer.hpp"
#include "impl/event/event.hpp"
#include "tt_metal/host_api.hpp"
#pragma clang diagnostic pop
Expand Down
68 changes: 13 additions & 55 deletions runtime/lib/ttmetal/command_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,6 @@ struct CQExecutor {
CQExecutor(::tt::tt_metal::Device *device, std::size_t cq_id,
std::vector<InputBuffer> const &inputs,
std::vector<OutputBuffer> const &outputs);
~CQExecutor() {
::tt::tt_metal::DeallocateBuffer(*buffers.at(kReservedBufferId));
buffers.erase(kReservedBufferId);
}

constexpr static uint32_t kReservedBufferId = 1 << 20;
void createReservedL1Buffer();

std::shared_ptr<::tt::tt_metal::Event>
execute(::tt::target::metal::CommandQueue const *commandQueue);
Expand Down Expand Up @@ -74,47 +67,6 @@ CQExecutor::CQExecutor(::tt::tt_metal::Device *device, std::size_t cq_id,
}

cq = &device->command_queue(cq_id);

createReservedL1Buffer();
}

void CQExecutor::createReservedL1Buffer() {
// Ideally size of the reserved space should be part of system desc.
constexpr uint32_t dimSize = 32;
constexpr uint32_t sizeKB = dimSize * dimSize * 4;
const uint32_t address = device->l1_size_per_core() - sizeKB;

CoreCoord cores = device->logical_grid_size();

std::array<uint32_t, 2> shape = {dimSize, dimSize};
ShardSpec shardSpec(
CoreRangeSet(
{CoreRange(CoreCoord(0, 0), CoreCoord(cores.x - 1, cores.y - 1))}),
shape);

array<uint32_t, 2> tensorShape = {static_cast<uint32_t>(cores.y),
static_cast<uint32_t>(cores.x)};
ShardSpecBuffer shardSpecBuffer(shardSpec, {dimSize, dimSize}, tensorShape);

tt::target::DataType dataType = tt::target::DataType::Float16;
uint64_t itemSize = ::tt::runtime::utils::dataTypeElementSize(dataType);
array<uint32_t, 2> pageShape = {dimSize, dimSize};
uint64_t pageSize = pageShape[0] * pageShape[1] * itemSize;
uint64_t size = pageSize * cores.x * cores.y;
ShardedBufferConfig shardConfig = {.device = device,
.size = size,
.page_size = pageSize,
.buffer_type = BufferType::L1,
.buffer_layout =
TensorMemoryLayout::BLOCK_SHARDED,
.shard_parameters = shardSpecBuffer,
.allocate = false};

std::shared_ptr<::tt::tt_metal::Buffer> buffer =
::tt::tt_metal::CreateBuffer(shardConfig);
buffer->set_address(address);

buffers[kReservedBufferId] = buffer;
}

std::shared_ptr<::tt::tt_metal::Event>
Expand Down Expand Up @@ -420,14 +372,8 @@ static ::tt::tt_metal::CircularBufferConfig createCircularBufferConfig(
toDataFormat(cbRef->desc()->memory_desc()->data_type());

if (!cbRef->tensor_ref()) {
assert(cbRef->address() >=
buffers.at(CQExecutor::kReservedBufferId)->address());
// TODO specifying address to CB must be done through shadow buffer.
// However, in this case we're seeing weird behavior w.r.t. to results when
// using shadow buffer. This needs to be investigated further.
return CircularBufferConfig(totalSize,
{{cbRef->desc()->port(), dataFormat}})
// *buffers.at(CQExecutor::kReservedBufferId))
.set_page_size(cbRef->desc()->port(), cbRef->desc()->page_size());
}

Expand Down Expand Up @@ -521,7 +467,19 @@ void CQExecutor::execute(
}
::tt::tt_metal::CircularBufferConfig config =
createCircularBufferConfig(cbRef, buffers);
::tt::tt_metal::CreateCircularBuffer(program, coreRangeSet, config);
CBHandle cbHandle =
::tt::tt_metal::CreateCircularBuffer(program, coreRangeSet, config);

if (!cbRef->tensor_ref()) {
// Internally allocated CBs are not associated with any tensor ref. We
// need to set the address of the CB manually.
std::shared_ptr<CircularBuffer> cbPtr =
tt_metal::detail::GetCircularBuffer(program, cbHandle);
assert(!cbPtr->globally_allocated() &&
"CB should not be globally allocated");
cbPtr->set_locally_allocated_address(cbRef->address());
}

createdCBs.insert(cbRef->desc()->port());
}

Expand Down
18 changes: 9 additions & 9 deletions test/ttmlir/Conversion/TTKernelToEmitC/ttkernel.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -27,23 +27,23 @@ module attributes {} {
"ttkernel.return"() : () -> ()
}

func.func @ttkernel_tensix(%arg1: !ttkernel.cb<cb_in0, 294912, memref<2x4x!tt.tile<32x32, f32>, #l1_>, 4096, 1, false>,
%arg2: !ttkernel.cb<cb_out0, 327680, memref<64x128xf32, #l1_>, 4096, 1, false>) -> () {
func.func @ttkernel_tensix(%arg1: !ttkernel.cb<cb_in0, 294912, memref<2x4x!tt.tile<32x32, f32>, #l1_>, 4096, 1>,
%arg2: !ttkernel.cb<cb_out0, 327680, memref<64x128xf32, #l1_>, 4096, 1>) -> () {
%c4_i32 = arith.constant 4 : i32
// CHECK: emitc.call_opaque "untilize_init"[[C:.*]]
"ttkernel.untilize_init"(%arg1, %arg2) : (!ttkernel.cb<cb_in0, 294912, memref<2x4x!tt.tile<32x32, f32>, #l1_>, 4096, 1, false>, !ttkernel.cb<cb_out0, 327680, memref<64x128xf32, #l1_>, 4096, 1, false>) -> ()
"ttkernel.untilize_init"(%arg1, %arg2) : (!ttkernel.cb<cb_in0, 294912, memref<2x4x!tt.tile<32x32, f32>, #l1_>, 4096, 1>, !ttkernel.cb<cb_out0, 327680, memref<64x128xf32, #l1_>, 4096, 1>) -> ()
// CHECK: emitc.call_opaque "untilize_block"[[C:.*]]
"ttkernel.untilize_block"(%arg1, %c4_i32, %arg2) : (!ttkernel.cb<cb_in0, 294912, memref<2x4x!tt.tile<32x32, f32>, #l1_>, 4096, 1, false>, i32, !ttkernel.cb<cb_out0, 327680, memref<64x128xf32, #l1_>, 4096, 1, false>) -> ()
"ttkernel.untilize_block"(%arg1, %c4_i32, %arg2) : (!ttkernel.cb<cb_in0, 294912, memref<2x4x!tt.tile<32x32, f32>, #l1_>, 4096, 1>, i32, !ttkernel.cb<cb_out0, 327680, memref<64x128xf32, #l1_>, 4096, 1>) -> ()
// CHECK: emitc.call_opaque "cb_pop_front"[[C:.*]]
"ttkernel.cb_pop_front"(%arg1, %c4_i32) : (!ttkernel.cb<cb_in0, 294912, memref<2x4x!tt.tile<32x32, f32>, #l1_>, 4096, 1, false>, i32) -> ()
"ttkernel.cb_pop_front"(%arg1, %c4_i32) : (!ttkernel.cb<cb_in0, 294912, memref<2x4x!tt.tile<32x32, f32>, #l1_>, 4096, 1>, i32) -> ()
// CHECK: emitc.call_opaque "cb_push_back"[[C:.*]]
"ttkernel.cb_push_back"(%arg2, %c4_i32) : (!ttkernel.cb<cb_out0, 327680, memref<64x128xf32, #l1_>, 4096, 1, false>, i32) -> ()
"ttkernel.cb_push_back"(%arg2, %c4_i32) : (!ttkernel.cb<cb_out0, 327680, memref<64x128xf32, #l1_>, 4096, 1>, i32) -> ()
// CHECK: emitc.call_opaque "untilize_block"[[C:.*]]
"ttkernel.untilize_block"(%arg1, %c4_i32, %arg2) : (!ttkernel.cb<cb_in0, 294912, memref<2x4x!tt.tile<32x32, f32>, #l1_>, 4096, 1, false>, i32, !ttkernel.cb<cb_out0, 327680, memref<64x128xf32, #l1_>, 4096, 1, false>) -> ()
"ttkernel.untilize_block"(%arg1, %c4_i32, %arg2) : (!ttkernel.cb<cb_in0, 294912, memref<2x4x!tt.tile<32x32, f32>, #l1_>, 4096, 1>, i32, !ttkernel.cb<cb_out0, 327680, memref<64x128xf32, #l1_>, 4096, 1>) -> ()
// CHECK: emitc.call_opaque "cb_pop_front"[[C:.*]]
"ttkernel.cb_pop_front"(%arg1, %c4_i32) : (!ttkernel.cb<cb_in0, 294912, memref<2x4x!tt.tile<32x32, f32>, #l1_>, 4096, 1, false>, i32) -> ()
"ttkernel.cb_pop_front"(%arg1, %c4_i32) : (!ttkernel.cb<cb_in0, 294912, memref<2x4x!tt.tile<32x32, f32>, #l1_>, 4096, 1>, i32) -> ()
// CHECK: emitc.call_opaque "cb_push_back"[[C:.*]]
"ttkernel.cb_push_back"(%arg2, %c4_i32) : (!ttkernel.cb<cb_out0, 327680, memref<64x128xf32, #l1_>, 4096, 1, false>, i32) -> ()
"ttkernel.cb_push_back"(%arg2, %c4_i32) : (!ttkernel.cb<cb_out0, 327680, memref<64x128xf32, #l1_>, 4096, 1>, i32) -> ()
// CHECK: return
"ttkernel.return"() : () -> ()
}
Expand Down

0 comments on commit d0eeefc

Please sign in to comment.