Skip to content

Commit

Permalink
[OpOptimization] Add BatchMatMul benchmark.
Browse files Browse the repository at this point in the history
  • Loading branch information
EllisLambda committed Aug 18, 2023
1 parent 5ab2b09 commit 7e7c64c
Show file tree
Hide file tree
Showing 5 changed files with 198 additions and 10 deletions.
8 changes: 8 additions & 0 deletions benchmarks/OpOptimization/MatMul/BatchMatMul.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
module {
  // Baseline batch matrix multiplication kernel used by the benchmark.
  // Computes %out += %lhs * %rhs for every batch slice; the build lowers
  // it either with -convert-linalg-to-loops (scalar baseline) or with
  // buddy-opt's -batchmatmul-optimize (vectorized variants).
  func.func @bm_batch_matmul(%lhs : memref<?x?x?xf32>,
                             %rhs : memref<?x?x?xf32>,
                             %out : memref<?x?x?xf32>) {
    linalg.batch_matmul
        ins(%lhs, %rhs : memref<?x?x?xf32>, memref<?x?x?xf32>)
        outs(%out : memref<?x?x?xf32>)
    return
  }
}
47 changes: 47 additions & 0 deletions benchmarks/OpOptimization/MatMul/BatchMatMulBroadcast.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
// The MLIR prototype of the `batchmatmul-optimize` pass in buddy-opt.
// STEP_PLACEHOLDER is substituted with the concrete vector width (e.g. 64)
// by the build script before this file is compiled.

// Number of vector-width steps needed to cover d0 columns (round up).
#map = affine_map<(d0) -> (d0 ceildiv STEP_PLACEHOLDER)>
func.func @batch_matmul_broadcast_STEP_PLACEHOLDER(%a : memref<?x?x?xf32>, %b : memref<?x?x?xf32>, %c : memref<?x?x?xf32>) {
// Index constants and the vector step width.
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%step = arith.constant STEP_PLACEHOLDER : index
// Zero vector used as the pass-through value for masked loads.
%c0_f32 = arith.constant 0.0 : f32
%c0_f32_vec = vector.splat %c0_f32 : vector<STEP_PLACEHOLDERxf32>

// Dynamic extents: batch count plus row/column sizes of A and B.
%a_row = memref.dim %a, %c1 : memref<?x?x?xf32>
%a_col = memref.dim %a, %c2 : memref<?x?x?xf32>
%b_row = memref.dim %b, %c1 : memref<?x?x?xf32>
%b_col = memref.dim %b, %c2 : memref<?x?x?xf32>
%batch = memref.dim %a, %c0 : memref<?x?x?xf32>

// Each batch slice is an independent matmul, so the batch loop is parallel.
affine.parallel (%batch_idx) = (0) to (%batch){
// Prefetch the A slice — about 3% faster per the author's measurement.
// NOTE(review): the element indices here are the dimension extents
// (%a_row, %a_col), i.e. one past the last valid element — confirm this
// address is intended for the prefetch hint.
affine.prefetch %a[%batch_idx, %a_row, %a_col], read, locality<3>, data : memref<?x?x?xf32>
affine.for %b_row_idx = 0 to %b_row {
affine.for %a_row_idx = 0 to %a_row {
affine.for %b_col_idx = 0 to #map(%b_col) {
// Broadcast a single A element across all vector lanes.
%a_ele = affine.load %a[%batch_idx, %a_row_idx, %b_row_idx] : memref<?x?x?xf32>
%a_vec = vector.broadcast %a_ele : f32 to vector<STEP_PLACEHOLDERxf32>
// Check tail: does a full vector still fit in the remaining columns?
%b_col_cur = arith.muli %b_col_idx, %step : index
%tail_len = arith.subi %b_col, %b_col_cur : index
%tail_flag = arith.cmpi sge, %tail_len, %step : index
scf.if %tail_flag {
// Full-width body: C[i, j..j+step) += A[i, k] * B[k, j..j+step).
%b_vec = affine.vector_load %b[%batch_idx, %b_row_idx, %b_col_idx * STEP_PLACEHOLDER] : memref<?x?x?xf32>, vector<STEP_PLACEHOLDERxf32>
%c_vec = affine.vector_load %c[%batch_idx, %a_row_idx, %b_col_idx * STEP_PLACEHOLDER] : memref<?x?x?xf32>, vector<STEP_PLACEHOLDERxf32>
%result_vec = vector.fma %a_vec, %b_vec, %c_vec : vector<STEP_PLACEHOLDERxf32>
affine.vector_store %result_vec, %c[%batch_idx, %a_row_idx, %b_col_idx * STEP_PLACEHOLDER] : memref<?x?x?xf32>, vector<STEP_PLACEHOLDERxf32>
} else {
// Tail path: mask off the lanes past the last column.
%mask_vec = vector.create_mask %tail_len : vector<STEP_PLACEHOLDERxi1>
%b_col_idx_tail = arith.muli %b_col_idx, %step : index
%b_vec_tail = vector.maskedload %b[%batch_idx, %b_row_idx, %b_col_idx_tail], %mask_vec, %c0_f32_vec : memref<?x?x?xf32>, vector<STEP_PLACEHOLDERxi1>, vector<STEP_PLACEHOLDERxf32> into vector<STEP_PLACEHOLDERxf32>
%c_vec_tail = vector.maskedload %c[%batch_idx, %a_row_idx, %b_col_idx_tail], %mask_vec, %c0_f32_vec : memref<?x?x?xf32>, vector<STEP_PLACEHOLDERxi1>, vector<STEP_PLACEHOLDERxf32> into vector<STEP_PLACEHOLDERxf32>
%result_vec_tail = vector.fma %a_vec, %b_vec_tail, %c_vec_tail : vector<STEP_PLACEHOLDERxf32>
vector.maskedstore %c[%batch_idx, %a_row_idx, %b_col_idx_tail], %mask_vec, %result_vec_tail : memref<?x?x?xf32>, vector<STEP_PLACEHOLDERxi1>, vector<STEP_PLACEHOLDERxf32>
}
}
}
}
}
return
}
49 changes: 49 additions & 0 deletions benchmarks/OpOptimization/MatMul/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,53 @@ add_custom_command(OUTPUT matmul-scalar.o
add_library(MatMulScalar STATIC matmul-scalar.o)
set_target_properties(MatMulScalar PROPERTIES LINKER_LANGUAGE CXX)

# Scalar (non-vectorized) batch matmul baseline.
# Renames the kernel via sed so it can coexist with the optimized variants,
# lowers it through the upstream MLIR pipeline, and compiles it to an
# object file that is wrapped in a static library below.
add_custom_command(OUTPUT batch-matmul-scalar.o
COMMAND cat ${BUDDY_SOURCE_DIR}/benchmarks/OpOptimization/MatMul/BatchMatMul.mlir |
sed 's/bm_batch_matmul/batch_matmul_scalar/' |
${LLVM_MLIR_BINARY_DIR}/mlir-opt
-convert-linalg-to-loops
-lower-affine
-convert-scf-to-cf
-convert-vector-to-llvm
-finalize-memref-to-llvm
-convert-arith-to-llvm
-llvm-request-c-wrappers
-convert-func-to-llvm
-reconcile-unrealized-casts |
${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir |
${LLVM_MLIR_BINARY_DIR}/llc -O3 -mtriple=${BUDDY_OPT_TRIPLE}
-mattr=${BUDDY_OPT_ATTR} --filetype=obj
-o ${BUDDY_BINARY_DIR}/../benchmarks/OpOptimization/MatMul/batch-matmul-scalar.o
)
add_library(BatchMatMulScalar STATIC batch-matmul-scalar.o)
set_target_properties(BatchMatMulScalar PROPERTIES LINKER_LANGUAGE CXX)

# Builds one optimized batch matmul variant for vector step `step`.
# The source is the same BatchMatMul.mlir; buddy-opt's -batchmatmul-optimize
# pass performs the broadcast vectorization. (BatchMatMulBroadcast.mlir
# appears to be the hand-written prototype of that pass's output — it is not
# compiled here; confirm that is intended.)
function(build_batch_matmul_broadcast step)
add_custom_command(OUTPUT batch-matmul-broadcast-${step}.o
COMMAND cat ${BUDDY_SOURCE_DIR}/benchmarks/OpOptimization/MatMul/BatchMatMul.mlir |
sed 's/bm_batch_matmul/batch_matmul_broadcast_${step}/g' |
${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt
-batchmatmul-optimize="step-placeholder=${step}"
-expand-strided-metadata
-lower-affine
-convert-vector-to-llvm
-finalize-memref-to-llvm
-convert-scf-to-cf
-convert-linalg-to-llvm
-llvm-request-c-wrappers
-convert-func-to-llvm
-reconcile-unrealized-casts |
${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir |
${LLVM_MLIR_BINARY_DIR}/llc -O3 -mtriple=${BUDDY_OPT_TRIPLE}
-mattr=${BUDDY_OPT_ATTR} --filetype=obj
-o ${BUDDY_BINARY_DIR}/../benchmarks/OpOptimization/MatMul/batch-matmul-broadcast-${step}.o
)
add_library(BatchMatMulBroadcast${step} STATIC batch-matmul-broadcast-${step}.o)
set_target_properties(BatchMatMulBroadcast${step} PROPERTIES LINKER_LANGUAGE CXX)
endfunction()

# Only the 64-lane variant is built and linked into the benchmark binary.
build_batch_matmul_broadcast(64)

add_executable(matmul-benchmark
Main.cpp
MatMulBenchmark.cpp
Expand All @@ -114,4 +161,6 @@ target_link_libraries(matmul-benchmark
MatMulBroadcast128
MatMulBroadcast256
MatMulScalar
BatchMatMulScalar
BatchMatMulBroadcast64
)
6 changes: 4 additions & 2 deletions benchmarks/OpOptimization/MatMul/Main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,15 @@

#include <benchmark/benchmark.h>

void verification();
void matmul_verification();
void batch_matmul_verification();

// Entry point: run all registered benchmark cases, then verify the
// optimized kernels against the scalar reference implementations.
int main(int argc, char **argv) {
  // Run benchmark.
  ::benchmark::Initialize(&argc, argv);
  ::benchmark::RunSpecifiedBenchmarks();
  // Run correctness verification after timing so it cannot skew results.
  // The old single `verification()` entry point was split into the two
  // calls below; the stale call to it is removed.
  matmul_verification();
  batch_matmul_verification();
  return 0;
}
98 changes: 90 additions & 8 deletions benchmarks/OpOptimization/MatMul/MatMulBenchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
//
//===----------------------------------------------------------------------===//

#include <array>
#include <benchmark/benchmark.h>
#include <buddy/Core/Container.h>
#include <iostream>
#include <vector>
Expand All @@ -27,6 +28,10 @@
#define M 64
#define N 3136
#define K 576
#define BATCH_M 16
#define BATCH_N 784
#define BATCH_K 144
#define BATCH 64

// Helper functions and variables.
namespace {
Expand Down Expand Up @@ -62,6 +67,11 @@ void _mlir_ciface_matmul_broadcast_256(MemRef<float, 2> *A, MemRef<float, 2> *B,
MemRef<float, 2> *C);
void _mlir_ciface_matmul_scalar(MemRef<float, 2> *A, MemRef<float, 2> *B,
MemRef<float, 2> *C);
void _mlir_ciface_batch_matmul_scalar(MemRef<float, 3> *A, MemRef<float, 3> *B,
MemRef<float, 3> *C);
void _mlir_ciface_batch_matmul_broadcast_64(MemRef<float, 3> *A,
MemRef<float, 3> *B,
MemRef<float, 3> *C);
}

#define DEFINE_MATMUL_BENCHMARK(name, func) \
Expand All @@ -79,6 +89,21 @@ void _mlir_ciface_matmul_scalar(MemRef<float, 2> *A, MemRef<float, 2> *B,
} \
}

// Defines a google-benchmark case that times `func` on a batch matmul of
// shape [BATCH, BATCH_M, BATCH_K] x [BATCH, BATCH_K, BATCH_N].
// Inputs are filled with 1.0 and the output with 0, so only throughput is
// measured; correctness is checked separately in batch_matmul_verification.
// NOTE(review): the emitted symbol prefix `BBATCH_M_MATMUL_` looks like a
// search/replace artifact of `BM_MATMUL_`; renaming it to
// `BM_BATCH_MATMUL_` would also require updating the BENCHMARK()
// registrations below.
#define DEFINE_BATCH_MATMUL_BENCHMARK(name, func)                             \
  void BBATCH_M_MATMUL_##name(benchmark::State &state) {                      \
    intptr_t sizesA[3] = {BATCH, BATCH_M, BATCH_K};                           \
    intptr_t sizesB[3] = {BATCH, BATCH_K, BATCH_N};                           \
    intptr_t sizesC[3] = {BATCH, BATCH_M, BATCH_N};                           \
                                                                               \
    MemRef<float, 3> A(sizesA, 1.0);                                          \
    MemRef<float, 3> B(sizesB, 1.0);                                          \
    MemRef<float, 3> C(sizesC, 0);                                            \
                                                                               \
    for (auto _ : state) {                                                     \
      func(&A, &B, &C);                                                        \
    }                                                                          \
  }

DEFINE_MATMUL_BENCHMARK(OCV, _mlir_ciface_matmul_ocv)
DEFINE_MATMUL_BENCHMARK(TRANSFORM, _mlir_ciface_matmul_transform)
DEFINE_MATMUL_BENCHMARK(BROADCAST_16, _mlir_ciface_matmul_broadcast_16)
Expand All @@ -87,6 +112,9 @@ DEFINE_MATMUL_BENCHMARK(BROADCAST_64, _mlir_ciface_matmul_broadcast_64)
DEFINE_MATMUL_BENCHMARK(BROADCAST_128, _mlir_ciface_matmul_broadcast_128)
DEFINE_MATMUL_BENCHMARK(BROADCAST_256, _mlir_ciface_matmul_broadcast_256)
DEFINE_MATMUL_BENCHMARK(SCALAR, _mlir_ciface_matmul_scalar)
DEFINE_BATCH_MATMUL_BENCHMARK(SCALAR, _mlir_ciface_batch_matmul_scalar) // batch_matmul
DEFINE_BATCH_MATMUL_BENCHMARK(BROADCAST_64,
_mlir_ciface_batch_matmul_broadcast_64) // batch_matmul
} // namespace

// Register benchmark cases.
Expand All @@ -98,15 +126,18 @@ BENCHMARK(BM_MATMUL_BROADCAST_32)->Unit(benchmark::kMillisecond);
BENCHMARK(BM_MATMUL_BROADCAST_64)->Unit(benchmark::kMillisecond);
BENCHMARK(BM_MATMUL_BROADCAST_128)->Unit(benchmark::kMillisecond);
// The BROADCAST_256 case was registered twice, which ran it twice;
// register it exactly once.
BENCHMARK(BM_MATMUL_BROADCAST_256)->Unit(benchmark::kMillisecond);
// Batch matmul cases (names come from DEFINE_BATCH_MATMUL_BENCHMARK).
BENCHMARK(BBATCH_M_MATMUL_SCALAR)->Unit(benchmark::kMillisecond);
BENCHMARK(BBATCH_M_MATMUL_BROADCAST_64)->Unit(benchmark::kMillisecond);

/// Correctness Verification
/// The verification does not affect the performance.
/// - Set the scalar case as the criteria.
/// - Input elements are random numbers.
/// - Output elements are initialized to zero.
/// - Compare the output of various optimizations with the scalar version to
/// verify correctness.
void verification() {
// Correctness Verification
// The verification does not affect the performance.
// - Set the scalar case as the criteria.
// - Input elements are random numbers.
// - Output elements are initialized to zero.
// - Compare the output of various optimizations with the scalar version to
// verify correctness.
void matmul_verification() {
// Set the random number generator.
std::random_device rd;
std::mt19937 generator(rd());
Expand Down Expand Up @@ -206,6 +237,57 @@ void verification() {
? PASS
: FAIL)
<< std::endl;
}

// Verify the optimized batch matmul kernel against the scalar reference.
// The verification does not affect the performance numbers: it runs after
// the benchmarks with freshly generated inputs.
// - The scalar case is the criterion.
// - Input elements are random numbers; output elements start at zero.
void batch_matmul_verification() {
  // Set the random number generator.
  std::random_device rd;
  std::mt19937 generator(rd());
  std::uniform_int_distribution<int> distribution(1, 100);

  // Set the layout sizes of input and output memref containers.
  intptr_t sizesA[3] = {BATCH, BATCH_M, BATCH_K};
  intptr_t sizesB[3] = {BATCH, BATCH_K, BATCH_N};
  intptr_t sizesC[3] = {BATCH, BATCH_M, BATCH_N};

  // Generate input A and input B memref containers with random numbers.
  // std::vector replaces the previous `new std::array` buffers, which were
  // never freed (memory leak); it releases the storage automatically.
  const int inputASize = BATCH * BATCH_M * BATCH_K;
  std::vector<float> inputARand(inputASize);
  for (int i = 0; i < inputASize; ++i) {
    inputARand[i] = distribution(generator);
  }
  MemRef<float, 3> inputAMemRef(inputARand.data(), sizesA);

  const int inputBSize = BATCH * BATCH_K * BATCH_N;
  std::vector<float> inputBRand(inputBSize);
  for (int i = 0; i < inputBSize; ++i) {
    inputBRand[i] = distribution(generator);
  }
  MemRef<float, 3> inputBMemRef(inputBRand.data(), sizesB);

  // Generate output memref containers initialized to zero.
  const int outputSize = BATCH * BATCH_M * BATCH_N;
  MemRef<float, 3> outputScalar(sizesC, 0);
  MemRef<float, 3> outputBroadcast64(sizesC, 0);

  // Perform both batch matmul implementations.
  _mlir_ciface_batch_matmul_scalar(&inputAMemRef, &inputBMemRef,
                                   &outputScalar);
  _mlir_ciface_batch_matmul_broadcast_64(&inputAMemRef, &inputBMemRef,
                                         &outputBroadcast64);

  // Get the result arrays.
  auto resultScalar = outputScalar.getData();
  auto resultBroadcast64 = outputBroadcast64.getData();

  // Print the verification result. Compare the full output (all BATCH
  // slices); the previous `outputSize / BATCH` checked only the first
  // batch slice.
  std::cout << "Batch Broadcast 64 case: "
            << (areArraysEqual(resultScalar, resultBroadcast64, outputSize)
                    ? PASS
                    : FAIL)
            << std::endl;
  std::cout << "-----------------------------------------------------------"
            << std::endl;
}

0 comments on commit 7e7c64c

Please sign in to comment.