diff --git a/include/imex/Transforms/Passes.h b/include/imex/Transforms/Passes.h
index f6e9baece..7392e2c8b 100644
--- a/include/imex/Transforms/Passes.h
+++ b/include/imex/Transforms/Passes.h
@@ -41,6 +41,7 @@ createOptimizeTransposePass(const std::string &device = "pvc");
 std::unique_ptr<mlir::Pass> createHoistTransposePass();
 std::unique_ptr<mlir::Pass> createVnniTransformationPass();
 std::unique_ptr<mlir::Pass> createEmulateNonNativeBF16Pass();
+std::unique_ptr<mlir::Pass> createTileLoopsPass();
 
 #define GEN_PASS_DECL
 #include "imex/Transforms/Passes.h.inc"
diff --git a/include/imex/Transforms/Passes.td b/include/imex/Transforms/Passes.td
index 67630726c..ac9220a36 100644
--- a/include/imex/Transforms/Passes.td
+++ b/include/imex/Transforms/Passes.td
@@ -210,4 +210,28 @@ def HoistTranspose : Pass<"imex-xegpu-hoist-transpose"> {
   ];
 }
 
+def TileLoops : Pass<"tile-loops", "::mlir::func::FuncOp"> {
+  let summary = "Tile linalg.generic loops for GPU offloading";
+  let description = [{
+    Tiles loops defined with tensor inputs/outputs using the given tile sizes.
+    This pass should be applied after loop fusion and before bufferization.
+    Tiling uses the upstream `tileUsingSCF` driver. To map a loop nest to
+    GPU blocks and threads, this pass should be run twice: once with
+    block-level tile sizes and once with thread-level tile sizes. If
+    `in-regions` is set, only loops within GPU regions are tiled.
+  }];
+  let options = [
+    ListOption<"tileSizes", "tile-sizes", "int64_t", "Tile sizes">,
+    Option<"minTileFactor", "min-tile-factor", "int64_t", "2",
+           "Minimum factor between dimension size and a tile size">,
+    Option<"inRegions", "in-regions", "bool", "false",
+           "Tile loops only within GPU regions">
+  ];
+  let constructor = "imex::createTileLoopsPass()";
+  let dependentDialects = [
+    "::mlir::linalg::LinalgDialect",
+    "::mlir::scf::SCFDialect"
+  ];
+}
+
 #endif // _IMEX_TRANSFORMS_PASSES_TD_INCLUDED_
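As the description above notes, mapping to GPU blocks and threads takes two rounds
of this pass. A minimal invocation sketch, mirroring the RUN line of the lit test
at the end of this patch (the sizes 32 and 1 and the file name are illustrative,
not defaults):

    // Round 1 tiles for blocks; round 2 tiles each block's tile for threads.
    imex-opt --split-input-file -tile-loops='tile-sizes=32' \
             -tile-loops='tile-sizes=1' input.mlir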
diff --git a/lib/Transforms/CMakeLists.txt b/lib/Transforms/CMakeLists.txt
index b1b2a6bf3..071397eca 100644
--- a/lib/Transforms/CMakeLists.txt
+++ b/lib/Transforms/CMakeLists.txt
@@ -15,6 +15,7 @@ add_mlir_library(IMEXTransforms
   VnniTransformation.cpp
   OptimizeTranspose.cpp
   HoistTranspose.cpp
+  TileLoops.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${PROJECT_SOURCE_DIR}/imex/Transforms
diff --git a/lib/Transforms/TileLoops.cpp b/lib/Transforms/TileLoops.cpp
new file mode 100644
index 000000000..edd358662
--- /dev/null
+++ b/lib/Transforms/TileLoops.cpp
@@ -0,0 +1,141 @@
+//===- TileLoops.cpp ------------------------------------*- C++ -*-===//
+//
+// Copyright 2023 Intel Corporation
+// Part of the IMEX Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the TileLoops transform, which tiles loops for GPU
+/// mapping.
+///
+//===----------------------------------------------------------------------===//
+
+#include <imex/Dialect/Region/RegionUtils.h>
+#include <imex/Transforms/Passes.h>
+#include <mlir/Dialect/Func/IR/FuncOps.h>
+#include <mlir/Dialect/Linalg/IR/Linalg.h>
+#include <mlir/Dialect/SCF/IR/SCF.h>
+#include <mlir/Dialect/SCF/Transforms/TileUsingInterface.h>
+#include <mlir/IR/PatternMatch.h>
+
+#include "llvm/Support/Threading.h"
+#include <llvm/Support/Debug.h>
+#include <memory>
+
+namespace imex {
+#define GEN_PASS_DEF_TILELOOPS
+#include "imex/Transforms/Passes.h.inc"
+} // namespace imex
+
+#define DEBUG_TYPE "tile-loops"
+
+#ifndef NDEBUG
+#define DEBUG_MSG(PREFIX, MSG)                                                 \
+  LLVM_DEBUG(llvm::dbgs() << PREFIX << ": " << MSG << "\n");
+#define DEBUG_OP(PREFIX, MSG, OP)                                              \
+  LLVM_DEBUG(llvm::dbgs() << PREFIX << ": " << MSG << " '" << OP->getName()    \
+                          << "' " << OP->getLoc() << "\n");
+#define DEBUG_OP_VEC(PREFIX, MSG, OPVEC)                                       \
+  LLVM_DEBUG(llvm::dbgs() << PREFIX << ": " << MSG << " (" << OPVEC.size()     \
+                          << ")\n");                                           \
+  for (auto op : OPVEC) {                                                      \
+    DEBUG_OP(PREFIX, "  ", op)                                                 \
+  }
+#else
+// No-op fallbacks so release (NDEBUG) builds still compile.
+#define DEBUG_MSG(PREFIX, MSG)
+#define DEBUG_OP(PREFIX, MSG, OP)
+#define DEBUG_OP_VEC(PREFIX, MSG, OPVEC)
+#endif
+
+using namespace imex;
+
+namespace {
+
+// Computes per-loop tile sizes from the user-provided list. The user-provided
+// tiles are consumed from the outermost loop inwards; if fewer tiles than
+// loops are provided, the remaining loops get a tile size of 0 (not tiled).
+static ::mlir::FailureOr<::mlir::SmallVector<int64_t>>
+getDefaultTileSizes(::mlir::linalg::LinalgOp linalgOp,
+                    ::mlir::ArrayRef<int64_t> userProvidedTiles) {
+  if (!userProvidedTiles.empty()) {
+    size_t numParallelLoops = linalgOp.getNumParallelLoops();
+    size_t nonZeros = 0;
+    for (auto tile : userProvidedTiles)
+      if (tile != 0)
+        nonZeros++;
+    // Reject the request if there are more non-zero tiles than parallel
+    // loops, or more tiles than loops overall.
+    if (nonZeros > numParallelLoops ||
+        userProvidedTiles.size() > linalgOp.getNumLoops()) {
+      return ::mlir::failure();
+    }
+
+    ::mlir::SmallVector<int64_t> userTiles(linalgOp.getNumLoops(), 0);
+    for (auto tile : ::llvm::enumerate(userProvidedTiles))
+      userTiles[tile.index()] = tile.value();
+    return userTiles;
+  }
+  return ::mlir::failure();
+}
+
+struct TileLoops final : public imex::impl::TileLoopsBase<TileLoops> {
+
+  using TileLoopsBase::TileLoopsBase;
+
+  void runOnOperation() override {
+    ::mlir::func::FuncOp func = getOperation();
+    ::mlir::IRRewriter rewriter(&getContext());
+    transform(rewriter, func, this->tileSizes, this->minTileFactor);
+  }
+
+private:
+  void transform(::mlir::RewriterBase &rewriter, ::mlir::func::FuncOp func,
+                 ::mlir::ArrayRef<int64_t> tileSizes, int64_t minTileFactor) {
+    DEBUG_MSG("tile-loops", "Entering transform");
+    // Collect the candidate linalg ops; with `in-regions` set, only ops
+    // enclosed in GPU regions are considered.
+    ::mlir::SmallVector<::mlir::Operation *> allLinalgOps;
+    func->walk([&](::mlir::linalg::LinalgOp linalgOp) {
+      if (!inRegions || ::imex::region::isInGpuRegion(linalgOp)) {
+        allLinalgOps.push_back(linalgOp);
+      }
+    });
+    DEBUG_OP_VEC("tile-loops", "  Found linalg ops", allLinalgOps);
+
+    for (auto op : allLinalgOps) {
+      DEBUG_OP("tile-loops", "  Tiling op:", op);
+      auto tiles = getDefaultTileSizes(
+          ::llvm::cast<::mlir::linalg::LinalgOp>(op), tileSizes);
+      if (failed(tiles)) {
+        DEBUG_MSG("tile-loops",
+                  "  Failed to compute default tile sizes. Aborting.");
+        return;
+      }
+      DEBUG_MSG("tile-loops", "  tile sizes:");
+      LLVM_DEBUG(llvm::dbgs() << "tile-loops: (");
+      LLVM_DEBUG(llvm::interleaveComma(*tiles, llvm::dbgs()));
+      LLVM_DEBUG(llvm::dbgs() << ")\n");
+
+      // Tile to an scf.forall loop using the upstream SCF tiling driver.
+      auto tilesRes =
+          ::mlir::getAsOpFoldResult(rewriter.getI64ArrayAttr(*tiles));
+      ::mlir::scf::SCFTilingOptions options;
+      options.setTileSizes(tilesRes);
+      options.setLoopType(::mlir::scf::SCFTilingOptions::LoopType::ForallOp);
+      auto tileOp = ::mlir::cast<::mlir::TilingInterface>(op);
+      ::mlir::FailureOr<::mlir::scf::SCFTilingResult> tilingResult =
+          mlir::scf::tileUsingSCF(rewriter, tileOp, options);
+      if (failed(tilingResult)) {
+        DEBUG_MSG("tile-loops", "  Failed to tile op. Aborting.");
+        return;
+      }
+      DEBUG_MSG("tile-loops", "  Tiling applied successfully.");
+      rewriter.replaceOp(op, tilingResult.value().replacements);
+    }
+  }
+};
+
+} // end anonymous namespace
+
+namespace imex {
+std::unique_ptr<mlir::Pass> createTileLoopsPass() {
+  return std::make_unique<TileLoops>();
+}
+} // namespace imex
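For intuition about what `tileUsingSCF` with `LoopType::ForallOp` produces here,
a hand-simplified sketch of the IR after the first tiling round (tile-sizes=32)
of the 129-element add in the test below; SSA names are illustrative and the
affine map is written inline rather than numbered:

    // Illustrative sketch, not verbatim pass output.
    scf.forall (%i) = (0) to (129) step (32) shared_outs(%out = %arg2) -> (tensor<129xf32>) {
      // affine.min clamps the last tile, since 129 is not a multiple of 32.
      %sz = affine.min affine_map<(d0) -> (-d0 + 129, 32)>(%i)
      %a = tensor.extract_slice %arg0[%i] [%sz] [1] : tensor<129xf32> to tensor<?xf32>
      %b = tensor.extract_slice %arg1[%i] [%sz] [1] : tensor<129xf32> to tensor<?xf32>
      %c = tensor.extract_slice %out[%i] [%sz] [1] : tensor<129xf32> to tensor<?xf32>
      %r = linalg.generic ... ins(%a, %b : tensor<?xf32>, tensor<?xf32>)
                              outs(%c : tensor<?xf32>) { ... } -> tensor<?xf32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %r into %out[%i] [%sz] [1] : tensor<?xf32> into tensor<129xf32>
      }
    }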
diff --git a/test/Transforms/tile-loops.mlir b/test/Transforms/tile-loops.mlir
new file mode 100644
index 000000000..098999f1a
--- /dev/null
+++ b/test/Transforms/tile-loops.mlir
@@ -0,0 +1,38 @@
+// RUN: imex-opt --split-input-file -tile-loops='tile-sizes=32' -tile-loops='tile-sizes=1' %s -verify-diagnostics -o - | FileCheck %s
+
+#map = affine_map<(d0) -> (d0)>
+module {
+  func.func @add(%arg0: tensor<129xf32>, %arg1: tensor<129xf32>, %arg2: tensor<129xf32>) -> tensor<129xf32> {
+    %0 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<129xf32>, tensor<129xf32>) outs(%arg2 : tensor<129xf32>) {
+    ^bb0(%in: f32, %in_0: f32, %out: f32):
+      %1 = arith.addf %in, %in_0 : f32
+      linalg.yield %1 : f32
+    } -> tensor<129xf32>
+    return %0 : tensor<129xf32>
+  }
+}
+// CHECK-LABEL: func.func @add
+// CHECK-NEXT: %[[FORALL:.*]] = scf.forall (%arg3) = (0) to (129) step (32) shared_outs(%arg4 = %arg2) -> (tensor<129xf32>) {
+// CHECK-NEXT: %[[C129:.*]] = arith.constant 129 : index
+// CHECK-NEXT: %[[MIN:.*]] = affine.min #map(%[[ARG3:.*]])
+// CHECK-NEXT: %[[APPLY1:.*]] = affine.apply #map1(%[[MIN]])
+// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %arg0[%[[ARG3]]] [%[[MIN]]] [1] : tensor<129xf32> to tensor<?xf32>
+// CHECK-NEXT: %[[EXTRACTED_SLICE_0:.*]] = tensor.extract_slice %arg1[%[[ARG3]]] [%[[MIN]]] [1] : tensor<129xf32> to tensor<?xf32>
+// CHECK-NEXT: %[[EXTRACTED_SLICE_1:.*]] = tensor.extract_slice %arg4[%[[ARG3]]] [%[[MIN]]] [1] : tensor<129xf32> to tensor<?xf32>
+// CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[INNER_FORALL:.*]] = scf.forall (%[[ARG5:.*]]) in (%[[MIN]]) shared_outs(%[[ARG6:.*]] = %[[EXTRACTED_SLICE_1]]) -> (tensor<?xf32>) {
+// CHECK-NEXT: %[[EXTRACTED_SLICE_4:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE]][%[[ARG5]]] [1] [1] : tensor<?xf32> to tensor<1xf32>
+// CHECK-NEXT: %[[EXTRACTED_SLICE_5:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE_0]][%[[ARG5]]] [1] [1] : tensor<?xf32> to tensor<1xf32>
+// CHECK-NEXT: %[[EXTRACTED_SLICE_6:.*]] = tensor.extract_slice %[[ARG6]][%[[ARG5]]] [1] [1] : tensor<?xf32> to tensor<1xf32>
+// CHECK-NEXT: %[[GENERIC:.*]] = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%[[EXTRACTED_SLICE_4]], %[[EXTRACTED_SLICE_5]] : tensor<1xf32>, tensor<1xf32>) outs(%[[EXTRACTED_SLICE_6]] : tensor<1xf32>) {
+// CHECK-NEXT: ^bb0(%[[IN:.*]]: f32, %[[IN_7:.*]]: f32, %[[OUT:.*]]: f32):
+// CHECK-NEXT: %[[ADDF:.*]] = arith.addf %[[IN]], %[[IN_7]] : f32
+// CHECK-NEXT: linalg.yield %[[ADDF]] : f32
+// CHECK-NEXT: } -> tensor<1xf32>
+// CHECK-NEXT: scf.forall.in_parallel {
+// CHECK-NEXT: tensor.parallel_insert_slice %[[GENERIC]] into %[[ARG6]][%[[ARG5]]] [1] [1] : tensor<1xf32> into tensor<?xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK: scf.forall.in_parallel {
+// CHECK-NEXT: tensor.parallel_insert_slice %[[INNER_FORALL]] into %arg4[%[[ARG3]]] [%[[MIN]]] [1] : tensor<?xf32> into tensor<129xf32>
+// CHECK-NEXT: }
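After the second round (tile-sizes=1), each 32-element tile is tiled again into
single elements, giving the nested structure the FileCheck lines above verify.
Schematically, this is the shape a later pass can map onto the GPU grid; the
block/thread mapping itself is not performed by this pass, and the names below
are illustrative:

    scf.forall (%i) = (0) to (129) step (32) ... {  // candidate for GPU blocks
      scf.forall (%j) in (%sz) ... {                // candidate for GPU threads
        linalg.generic ...                          // one element per thread
      }
    }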