TPP Runner Wrapper pass (#905)
Refactor MLIR Bench infrastructure into a pass.

The core logic is already covered by tpp-run tests.
adam-smnk authored Apr 29, 2024
1 parent 3c26c4c commit 38807f2
Showing 10 changed files with 338 additions and 139 deletions.
50 changes: 50 additions & 0 deletions include/TPP/Passes.td
@@ -542,4 +542,54 @@ def ConvertAddInplacePass: Pass<"linalg-convert-add-in-place",
let dependentDialects = ["linalg::LinalgDialect"];
}

def TppRunnerWrapper : Pass<"tpp-runner-wrapper", "ModuleOp">{
let summary = "Create main function runner wrapper";
let description = [{
Creates a runner wrapper that maps the kernel arguments and randomly
initializes them. Optionally, inserts a benchmark wrapper that calls the
main kernel repeatedly and takes measurements, or prints the result at
the end.
}];
let dependentDialects = ["func::FuncDialect",
"tensor::TensorDialect",
"memref::MemRefDialect",
"gpu::GPUDialect",
"arith::ArithDialect",
"scf::SCFDialect",
"vector::VectorDialect",
"bufferization::BufferizationDialect",
"perf::PerfDialect"];
let options = [
Option<"kernelName", "kernel-name", "std::string",
/*default=*/"\"entry\"",
"The kernel function to be called.">,
Option<"kernelType", "kernel-type", "std::string",
/*default=*/"\"void\"",
"The type of the kernel function.">,
Option<"backend", "backend", "std::string",
/*default=*/"\"cpu\"",
"Kernel target device backend (cpu, cuda, vulkan).">,
Option<"offloadToDevice", "offload-on-device", "bool",
/*default=*/"true",
"Offload kernel arguments to the target device.">,
Option<"numBenchLoops", "bench-loops", "int64_t",
/*default=*/"1",
"Number of benchmarking loops.">,
Option<"benchWarmup", "bench-warmup", "bool",
/*default=*/"true",
"Add benchmark warmup loops.">,
Option<"printResult", "print", "bool",
/*default=*/"false",
"Print kernel results.">,
Option<"randomSplat", "random-splat", "bool",
/*default=*/"false",
"Replace splat dense tensors with random values.">,
Option<"seed", "seed", "int64_t",
/*default=*/"0",
"Initialization random seed.">,
Option<"initType", "init-type", "std::string",
/*default=*/"",
"Initializer type (const, simple, cont, rand, normal).">,
];
}

#endif // TPP_DIALECT_TPP_PASSES
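
As a rough usage sketch (not part of this commit), the new pass can be scheduled from C++ once the TableGen records above are processed. The options-struct and create-function names below assume MLIR's usual autogenerated naming and may differ from what Passes.h.inc actually emits:

#include "TPP/Passes.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/PassManager.h"

// Hypothetical driver: wrap kernel `entry` for a CPU benchmark with 100
// timed loops. The field names mirror the Option<> records above.
mlir::LogicalResult addRunnerWrapper(mlir::ModuleOp module) {
  mlir::PassManager pm(module.getContext());
  mlir::tpp::TppRunnerWrapperOptions options; // assumed generated name
  options.kernelName = "entry";
  options.backend = "cpu";
  options.numBenchLoops = 100;
  pm.addPass(mlir::tpp::createTppRunnerWrapper(options)); // assumed name
  return pm.run(module);
}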
44 changes: 26 additions & 18 deletions tools/tpp-run/MLIRBench.h → include/TPP/Runner/MLIRBench.h
@@ -1,14 +1,20 @@
#ifndef TPP_RUN_MLIRBENCH_H
#define TPP_RUN_MLIRBENCH_H

//===- MLIRBench.h - MLIR Benchmark Producer ------------------------------===//
//===- MLIRBench.h - MLIR Benchmark Producer --------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Producer for benchmark wrapper methods. Upon selecting a kernel to run, maps
// the arguments, randomly initializes them, and calls the kernel as many times
// as requested, taking measurements and printing the result at the end.
//
//===----------------------------------------------------------------------===//

#ifndef TPP_RUNNER_MLIRBENCH_H
#define TPP_RUNNER_MLIRBENCH_H

#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/IR/Builders.h"
@@ -25,6 +31,7 @@ class ModuleOp;
class MemRefType;
class Operation;
class Value;

namespace func {
class FuncOp;
} // namespace func
@@ -33,11 +40,15 @@ class FuncOp;
// pipeline.
struct MLIRBenchConfig {
MLIRBenchConfig() = default;
MLIRBenchConfig(int seed, TensorInitType initType)
: seed(seed), initType(initType) {}
MLIRBenchConfig(int seed, TensorInitType initType, std::string backend,
bool offloadToDevice)
: seed(seed), initType(initType), backend(backend),
offloadToDevice(offloadToDevice) {}

int seed = 0;
TensorInitType initType = TensorInitType::Auto;
std::string backend = "cpu";
bool offloadToDevice = true;
};

/// MLIRBench - Creates wrapper for calling kernel methods.
@@ -47,15 +58,6 @@ struct MLIRBenchConfig {
/// interface is a bit weird, but it will get better once we clear the
/// API design, with time.
class MLIRBench {
/// Min number of warmup loops
static unsigned constexpr minIters = 1;

/// Max number of warmup loops
static unsigned constexpr maxIters = 100;

/// Target ratio of warmup loops: ( total iterations / warmupRatio )
static unsigned constexpr warmupRatio = 10;

/// MLIR OpBuilder
OpBuilder builder;

@@ -86,6 +88,12 @@ class MLIRBench {
/// Tensor init type
TensorInitType initType;

/// Target device backend
std::string backend;

/// Allocate arguments on target device
bool offloadToDevice;

/// Gets module's main block
Block &getModuleBlock();

@@ -143,8 +151,8 @@
/// Prints the result of a kernel call
LogicalResult printResult(Operation *kernelCall);

/// Terminates the function, issuing a return, lower to LLVM
LogicalResult finalize();
/// Terminates the function, issuing a return.
LogicalResult terminate();

/// Reports error on the current module's location
LogicalResult emitError(llvm::Twine);
@@ -155,4 +163,4 @@

} // namespace mlir

#endif
#endif // TPP_RUNNER_MLIRBENCH_H
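
A small caller-side sketch (illustrative, not in the diff) of how the extended config above is meant to be used, based on the constructor signatures visible in this header:

// Configure the bench producer for a CUDA target with device-resident
// argument buffers, matching the new config fields above. `moduleOp` is
// assumed to be the mlir::ModuleOp holding the kernel.
mlir::MLIRBenchConfig config(/*seed=*/42, mlir::TensorInitType::Auto,
                             /*backend=*/"cuda", /*offloadToDevice=*/true);
mlir::MLIRBench bench(moduleOp.getOperation(), config);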
1 change: 1 addition & 0 deletions lib/TPP/CMakeLists.txt
@@ -2,6 +2,7 @@ add_subdirectory(Dialect)
add_subdirectory(Conversion)
add_subdirectory(IR)
add_subdirectory(GPU)
add_subdirectory(Runner)
add_subdirectory(Transforms)

get_property(mlir_dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
14 changes: 14 additions & 0 deletions lib/TPP/Runner/CMakeLists.txt
@@ -0,0 +1,14 @@
add_mlir_library(TPPRunner
MLIRBench.cpp
TppRunnerWrapper.cpp

ADDITIONAL_HEADER_DIRS
${PROJECT_SOURCE_DIR}/include/TPP

DEPENDS
${mlir_dialect_libs}
MLIRIR
MLIRPass
TPPPerfDialect
TPPTransformsUtils
)
55 changes: 15 additions & 40 deletions tools/tpp-run/MLIRBench.cpp → lib/TPP/Runner/MLIRBench.cpp
@@ -1,12 +1,18 @@
//===- MLIRBench.cpp - MLIR Benchmark Producer ----------------------------===//
//===- MLIRBench.cpp - MLIR Benchmark Producer -------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Producer for benchmark wrapper methods. Upon selecting a kernel to run, maps
// the arguments, randomly initializes them, and calls the kernel as many times
// as requested, taking measurements and printing the result at the end.
//
//===----------------------------------------------------------------------===//

#include "MLIRBench.h"
#include "TPP/Runner/MLIRBench.h"

#include "mlir/Dialect/Arith/Transforms/Passes.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
@@ -47,33 +53,15 @@

using namespace mlir;

// Select target GPU backend for the pipeline.
llvm::cl::opt<std::string>
defGpuBackend("gpu", llvm::cl::desc("Target GPU backend for lowering"),
llvm::cl::value_desc("cuda,vulkan"), llvm::cl::init(""));

// Kernel buffers - arguments and return values - are expected to be allocated
// on GPU.
llvm::cl::opt<bool>
defGpuArgs("gpu-args",
llvm::cl::desc("Kernel buffers are allocated on GPU"),
llvm::cl::init(true));

MLIRBench::MLIRBench(mlir::Operation *op, const MLIRBenchConfig &config)
: builder(op->getContext()), unkLoc(builder.getUnknownLoc()) {
seed = config.seed;
backend = config.backend;
initType = config.initType;
offloadToDevice = config.offloadToDevice;

module = dyn_cast<ModuleOp>(op);
assert(module && "expected a 'builtin.Module' op");
auto *ctx = module->getContext();
ctx->getOrLoadDialect<tensor::TensorDialect>();
ctx->getOrLoadDialect<vector::VectorDialect>();
ctx->getOrLoadDialect<scf::SCFDialect>();
ctx->getOrLoadDialect<math::MathDialect>();
ctx->getOrLoadDialect<bufferization::BufferizationDialect>();
ctx->getOrLoadDialect<perf::PerfDialect>();
ctx->getOrLoadDialect<gpu::GPUDialect>();
}

LogicalResult MLIRBench::findKernel(StringRef name) {
@@ -187,10 +175,10 @@ LogicalResult MLIRBench::renameKernel() {

Value MLIRBench::registerOnGpu(Value buf, MemRefType memRefTy) {
// Do nothing when not using GPU
if (defGpuBackend.empty() || !defGpuArgs)
if (!offloadToDevice || !(backend == "cuda" || backend == "vulkan"))
return buf;

if (defGpuBackend == "vulkan") {
if (backend == "vulkan") {
// Copy to heap as global memory is not shared between host and device
auto localBuf = builder.create<memref::AllocOp>(unkLoc, memRefTy);
auto copy = builder.create<memref::CopyOp>(unkLoc, buf, localBuf);
@@ -396,7 +384,7 @@ LogicalResult MLIRBench::printResult(Operation *kernelCall) {

// Kernels must return a single result
Value result = kernelCall->getResult(0);
if (defGpuBackend == "cuda" && defGpuArgs) {
if (backend == "cuda" && offloadToDevice) {
auto resType = cast<ShapedType>(result.getType());
auto memrefType =
MemRefType::get(resType.getShape(), resType.getElementType());
Expand Down Expand Up @@ -424,7 +412,7 @@ LogicalResult MLIRBench::printResult(Operation *kernelCall) {
return printShapedType(result);
}

LogicalResult MLIRBench::finalize() {
LogicalResult MLIRBench::terminate() {
// If we created a main at all...
// return void and add func to Module
if (main) {
@@ -433,19 +421,6 @@
builder.create<func::ReturnOp>(unkLoc);
}

// A set of default passes that lower any input IR to LLVM
PassManager passManager(module->getContext());

tpp::DefaultPipelineOptions options{defGpuBackend};
passManager.addPass(tpp::createDefaultPipeline(options));

auto result = passManager.run(module);
if (failed(result)) {
llvm::errs() << "ERROR: Failed to lower IR to LLVM dialect\n";
module->print(llvm::errs());
return result;
}

return success();
}

@@ -461,4 +436,4 @@ LogicalResult MLIRBench::emitError(llvm::Twine desc) {
return module.emitError(desc);
}

std::string MLIRBench::getGPUName() { return defGpuBackend; }
std::string MLIRBench::getGPUName() { return backend; }
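
Since finalize() has been reduced to terminate(), lowering to LLVM is no longer MLIRBench's job. A hedged sketch of what a caller such as tpp-run would now do instead, reusing the default pipeline the removed block above invoked (`module` and `backend` stand for the caller's module and backend string):

// Assumed caller-side lowering, mirroring the block removed from finalize():
mlir::PassManager passManager(module->getContext());
mlir::tpp::DefaultPipelineOptions options{backend}; // backend as in the pass options
passManager.addPass(mlir::tpp::createDefaultPipeline(options));
if (mlir::failed(passManager.run(module)))
  llvm::errs() << "ERROR: Failed to lower IR to LLVM dialect\n";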