TPP Runner Wrapper pass (#905)
Refactor MLIR Bench infrastructure into a pass.

The core logic is already covered by tpp-run tests.
adam-smnk authored Apr 29, 2024
1 parent 3c26c4c commit 38807f2
Showing 10 changed files with 338 additions and 139 deletions.
50 changes: 50 additions & 0 deletions include/TPP/Passes.td
@@ -542,4 +542,54 @@ def ConvertAddInplacePass: Pass<"linalg-convert-add-in-place",
let dependentDialects = ["linalg::LinalgDialect"];
}

def TppRunnerWrapper : Pass<"tpp-runner-wrapper", "ModuleOp">{
let summary = "Create main function runner wrapper";
let description = [{
Creates a runner wrapper that maps the kernel arguments and randomly
initializes them. Optionally, inserts a benchmark wrapper that calls the
main kernel repeatedly and takes measurements, or prints the result at
the end.
}];
let dependentDialects = ["func::FuncDialect",
"tensor::TensorDialect",
"memref::MemRefDialect",
"gpu::GPUDialect",
"arith::ArithDialect",
"scf::SCFDialect",
"vector::VectorDialect",
"bufferization::BufferizationDialect",
"perf::PerfDialect"];
let options = [
Option<"kernelName", "kernel-name", "std::string",
/*default=*/"\"entry\"",
"The kernel function to be called.">,
Option<"kernelType", "kernel-type", "std::string",
/*default=*/"\"void\"",
"The type of the kernel function.">,
Option<"backend", "backend", "std::string",
/*default=*/"\"cpu\"",
"Kernel target device backend (cpu, cuda, vulkan).">,
Option<"offloadToDevice", "offload-on-device", "bool",
/*default=*/"true",
"Offload kernel arguments to the target device.">,
Option<"numBenchLoops", "bench-loops", "int64_t",
/*default=*/"1",
"Number of benchmarking loops.">,
Option<"benchWarmup", "bench-warmup", "bool",
/*default=*/"true",
"Add benchmark warmup loops.">,
Option<"printResult", "print", "bool",
/*default=*/"false",
"Print kernel results.">,
Option<"randomSplat", "random-splat", "bool",
/*default=*/"false",
"Replace splat dense tensors with random values.">,
Option<"seed", "seed", "int64_t",
/*default=*/"0",
"Initialization random seed.">,
Option<"initType", "init-type", "std::string",
/*default=*/"",
"Initializer type (const, simple, cont, rand, normal).">,
];
}

#endif // TPP_DIALECT_TPP_PASSES
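
As a rough usage sketch (not part of this commit), the new pass can be scheduled from C++ once the TableGen records above are processed. The options-struct and create-function names below assume MLIR's usual autogenerated naming and may differ from what Passes.h.inc actually emits:

#include "TPP/Passes.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/PassManager.h"

// Hypothetical driver: wrap kernel `entry` for a CPU benchmark with 100
// timed loops. The field names mirror the Option<> records above.
mlir::LogicalResult addRunnerWrapper(mlir::ModuleOp module) {
  mlir::PassManager pm(module.getContext());
  mlir::tpp::TppRunnerWrapperOptions options; // assumed generated name
  options.kernelName = "entry";
  options.backend = "cpu";
  options.numBenchLoops = 100;
  pm.addPass(mlir::tpp::createTppRunnerWrapper(options)); // assumed name
  return pm.run(module);
}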
44 changes: 26 additions & 18 deletions tools/tpp-run/MLIRBench.h → include/TPP/Runner/MLIRBench.h
@@ -1,14 +1,20 @@
#ifndef TPP_RUN_MLIRBENCH_H
#define TPP_RUN_MLIRBENCH_H

//===- MLIRBench.h - MLIR Benchmark Producer ------------------------------===//
//===- MLIRBench.h - MLIR Benchmark Producer --------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Producer for benchmark wrapper methods. Upon selecting a kernel to run, maps
// the arguments, randomly initializes them, and calls the kernel as many times
// as requested, taking measurements and printing the result at the end.
//
//===----------------------------------------------------------------------===//

#ifndef TPP_RUNNER_MLIRBENCH_H
#define TPP_RUNNER_MLIRBENCH_H

#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/IR/Builders.h"
@@ -25,6 +31,7 @@ class ModuleOp;
class MemRefType;
class Operation;
class Value;

namespace func {
class FuncOp;
} // namespace func
@@ -33,11 +40,15 @@ class FuncOp;
// pipeline.
struct MLIRBenchConfig {
MLIRBenchConfig() = default;
MLIRBenchConfig(int seed, TensorInitType initType)
: seed(seed), initType(initType) {}
MLIRBenchConfig(int seed, TensorInitType initType, std::string backend,
bool offloadToDevice)
: seed(seed), initType(initType), backend(backend),
offloadToDevice(offloadToDevice) {}

int seed = 0;
TensorInitType initType = TensorInitType::Auto;
std::string backend = "cpu";
bool offloadToDevice = true;
};

/// MLIRBench - Creates wrapper for calling kernel methods.
@@ -47,15 +58,6 @@ struct MLIRBenchConfig {
/// interface is a bit weird, but it will get better once we clear the
/// API design, with time.
class MLIRBench {
/// Min number of warmup loops
static unsigned constexpr minIters = 1;

/// Max number of warmup loops
static unsigned constexpr maxIters = 100;

/// Target ratio of warmup loops: ( total iterations / warmupRatio )
static unsigned constexpr warmupRatio = 10;

/// MLIR OpBuilder
OpBuilder builder;

@@ -86,6 +88,12 @@ class MLIRBench {
/// Tensor init type
TensorInitType initType;

/// Target device backend
std::string backend;

/// Allocate arguments on target device
bool offloadToDevice;

/// Gets module's main block
Block &getModuleBlock();

@@ -143,8 +151,8 @@
/// Prints the result of a kernel call
LogicalResult printResult(Operation *kernelCall);

/// Terminates the function, issuing a return, lower to LLVM
LogicalResult finalize();
/// Terminates the function, issuing a return.
LogicalResult terminate();

/// Reports error on the current module's location
LogicalResult emitError(llvm::Twine);
@@ -155,4 +163,4 @@

} // namespace mlir

#endif
#endif // TPP_RUNNER_MLIRBENCH_H
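
A small caller-side sketch (illustrative, not in the diff) of how the extended config above is meant to be used, based on the constructor signatures visible in this header:

// Configure the bench producer for a CUDA target with device-resident
// argument buffers, matching the new config fields above. `moduleOp` is
// assumed to be the mlir::ModuleOp holding the kernel.
mlir::MLIRBenchConfig config(/*seed=*/42, mlir::TensorInitType::Auto,
                             /*backend=*/"cuda", /*offloadToDevice=*/true);
mlir::MLIRBench bench(moduleOp.getOperation(), config);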
1 change: 1 addition & 0 deletions lib/TPP/CMakeLists.txt
@@ -2,6 +2,7 @@ add_subdirectory(Dialect)
add_subdirectory(Conversion)
add_subdirectory(IR)
add_subdirectory(GPU)
add_subdirectory(Runner)
add_subdirectory(Transforms)

get_property(mlir_dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
14 changes: 14 additions & 0 deletions lib/TPP/Runner/CMakeLists.txt
@@ -0,0 +1,14 @@
add_mlir_library(TPPRunner
MLIRBench.cpp
TppRunnerWrapper.cpp

ADDITIONAL_HEADER_DIRS
${PROJECT_SOURCE_DIR}/include/TPP

DEPENDS
${mlir_dialect_libs}
MLIRIR
MLIRPass
TPPPerfDialect
TPPTransformsUtils
)
55 changes: 15 additions & 40 deletions tools/tpp-run/MLIRBench.cpp → lib/TPP/Runner/MLIRBench.cpp
@@ -1,12 +1,18 @@
//===- MLIRBench.cpp - MLIR Benchmark Producer ----------------------------===//
//===- MLIRBench.cpp - MLIR Benchmark Producer -------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Producer for benchmark wrapper methods. Upon selecting a kernel to run, maps
// the arguments, randomly initializes them, and calls the kernel as many times
// as requested, taking measurements and printing the result at the end.
//
//===----------------------------------------------------------------------===//

#include "MLIRBench.h"
#include "TPP/Runner/MLIRBench.h"

#include "mlir/Dialect/Arith/Transforms/Passes.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
@@ -47,33 +53,15 @@

using namespace mlir;

// Select target GPU backend for the pipeline.
llvm::cl::opt<std::string>
defGpuBackend("gpu", llvm::cl::desc("Target GPU backend for lowering"),
llvm::cl::value_desc("cuda,vulkan"), llvm::cl::init(""));

// Kernel buffers - arguments and return values - are expected to be allocated
// on GPU.
llvm::cl::opt<bool>
defGpuArgs("gpu-args",
llvm::cl::desc("Kernel buffers are allocated on GPU"),
llvm::cl::init(true));

MLIRBench::MLIRBench(mlir::Operation *op, const MLIRBenchConfig &config)
: builder(op->getContext()), unkLoc(builder.getUnknownLoc()) {
seed = config.seed;
backend = config.backend;
initType = config.initType;
offloadToDevice = config.offloadToDevice;

module = dyn_cast<ModuleOp>(op);
assert(module && "expected a 'builtin.Module' op");
auto *ctx = module->getContext();
ctx->getOrLoadDialect<tensor::TensorDialect>();
ctx->getOrLoadDialect<vector::VectorDialect>();
ctx->getOrLoadDialect<scf::SCFDialect>();
ctx->getOrLoadDialect<math::MathDialect>();
ctx->getOrLoadDialect<bufferization::BufferizationDialect>();
ctx->getOrLoadDialect<perf::PerfDialect>();
ctx->getOrLoadDialect<gpu::GPUDialect>();
}

LogicalResult MLIRBench::findKernel(StringRef name) {
@@ -187,10 +175,10 @@ LogicalResult MLIRBench::renameKernel() {

Value MLIRBench::registerOnGpu(Value buf, MemRefType memRefTy) {
// Do nothing when not using GPU
if (defGpuBackend.empty() || !defGpuArgs)
if (!offloadToDevice || !(backend == "cuda" || backend == "vulkan"))
return buf;

if (defGpuBackend == "vulkan") {
if (backend == "vulkan") {
// Copy to heap as global memory is not shared between host and device
auto localBuf = builder.create<memref::AllocOp>(unkLoc, memRefTy);
auto copy = builder.create<memref::CopyOp>(unkLoc, buf, localBuf);
@@ -396,7 +384,7 @@ LogicalResult MLIRBench::printResult(Operation *kernelCall) {

// Kernels must return a single result
Value result = kernelCall->getResult(0);
if (defGpuBackend == "cuda" && defGpuArgs) {
if (backend == "cuda" && offloadToDevice) {
auto resType = cast<ShapedType>(result.getType());
auto memrefType =
MemRefType::get(resType.getShape(), resType.getElementType());
Expand Down Expand Up @@ -424,7 +412,7 @@ LogicalResult MLIRBench::printResult(Operation *kernelCall) {
return printShapedType(result);
}

LogicalResult MLIRBench::finalize() {
LogicalResult MLIRBench::terminate() {
// If we created a main at all...
// return void and add func to Module
if (main) {
@@ -433,19 +421,6 @@
builder.create<func::ReturnOp>(unkLoc);
}

// A set of default passes that lower any input IR to LLVM
PassManager passManager(module->getContext());

tpp::DefaultPipelineOptions options{defGpuBackend};
passManager.addPass(tpp::createDefaultPipeline(options));

auto result = passManager.run(module);
if (failed(result)) {
llvm::errs() << "ERROR: Failed to lower IR to LLVM dialect\n";
module->print(llvm::errs());
return result;
}

return success();
}

@@ -461,4 +436,4 @@ LogicalResult MLIRBench::emitError(llvm::Twine desc) {
return module.emitError(desc);
}

std::string MLIRBench::getGPUName() { return defGpuBackend; }
std::string MLIRBench::getGPUName() { return backend; }
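
Since finalize() has been reduced to terminate(), lowering to LLVM is no longer MLIRBench's job. A hedged sketch of what a caller such as tpp-run would now do instead, reusing the default pipeline the removed block above invoked (`module` and `backend` stand for the caller's module and backend string):

// Assumed caller-side lowering, mirroring the block removed from finalize():
mlir::PassManager passManager(module->getContext());
mlir::tpp::DefaultPipelineOptions options{backend}; // backend as in the pass options
passManager.addPass(mlir::tpp::createDefaultPipeline(options));
if (mlir::failed(passManager.run(module)))
  llvm::errs() << "ERROR: Failed to lower IR to LLVM dialect\n";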