Skip to content

Commit

Permalink
Run each model benchmark at least 10x to amortize setup noise.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 708329672
  • Loading branch information
gonnet authored and xnnpack-bot committed Dec 27, 2024
1 parent c2bddb4 commit 72e4d09
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 62 deletions.
65 changes: 13 additions & 52 deletions bench/models/benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,12 @@

#include <benchmark/benchmark.h>

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <iostream>
#include <memory>
#include <vector>

Expand All @@ -23,8 +21,9 @@
#include "xnnpack/subgraph.h"
#include "pthreadpool.h"

// Thread count taken from the --num_threads=N command-line flag (parsed by
// ProcessArgs, which rejects non-positive values). Defaults to 1.
int FLAGS_num_threads = 1;
// Value taken from the --xnn_runtime_flags=V command-line flag (parsed by
// ProcessArgs as decimal, or as hexadecimal after a "0x" prefix).
// Defaults to 0.
uint32_t FLAGS_xnn_runtime_flags = 0;
namespace {
// Batch size handed to benchmark::State::KeepRunningBatch, i.e. how many
// times the model is invoked per benchmark batch.
constexpr size_t kMinIterations = 10;
}  // namespace

struct ModelRuntime {
std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> model;
Expand Down Expand Up @@ -116,11 +115,15 @@ static void BenchmarkInvoke(benchmark::State& state,
return;
}

for (auto _ : state) {
benchmark::utils::WipePthreadpoolL2Caches(state, model_runtime.threadpool);
if (!model_runtime.Invoke()) {
state.SkipWithError("failed to invoke runtime");
return;
while (state.KeepRunningBatch(kMinIterations)) {
for (int iter = 0; iter < kMinIterations; iter++) {
benchmark::utils::WipePthreadpoolL2Caches(state,
model_runtime.threadpool);

if (!model_runtime.Invoke()) {
state.SkipWithError("failed to invoke runtime");
return;
}
}
}

Expand Down Expand Up @@ -239,46 +242,4 @@ BENCHMARK(QD8Attention)

// Register the QS8MobileNetV2 benchmark; timings are reported in
// microseconds and measured with real (wall-clock) time.
BENCHMARK(QS8MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();

// Parses the XNNPACK-specific command-line flags and strips them from the
// argument list.
//
// Recognized flags:
//   --num_threads=N        positive decimal integer, stored in
//                          FLAGS_num_threads.
//   --xnn_runtime_flags=V  unsigned 32-bit value, written in decimal or in
//                          hexadecimal with a "0x"/"0X" prefix, stored in
//                          FLAGS_xnn_runtime_flags.
//
// Each recognized flag is removed from `argv` and `argc` is decremented, so
// the remaining arguments keep their relative order. Unrecognized arguments
// are left untouched.
//
// Returns 0 on success. Returns 1, after printing a diagnostic to std::cerr,
// when a recognized flag carries a value that is empty, malformed, has
// trailing characters, or is out of range; the corresponding FLAGS_ variable
// is left unmodified in that case.
int ProcessArgs(int& argc, char**& argv) {
  static const char kNumThreadsPrefix[] = "--num_threads=";
  static const char kRuntimeFlagsPrefix[] = "--xnn_runtime_flags=";
  const size_t num_threads_len = sizeof(kNumThreadsPrefix) - 1;
  const size_t runtime_flags_len = sizeof(kRuntimeFlagsPrefix) - 1;

  // Removes argv[index] by shifting the following arguments down one slot.
  // The vacated last slot is cleared so that argv[argc] remains a null
  // pointer instead of a stale duplicate of the last argument.
  const auto remove_arg = [&argc, &argv](int index) {
    std::copy(argv + index + 1, argv + argc, argv + index);
    argc -= 1;
    argv[argc] = nullptr;
  };

  for (int i = 1; i < argc;) {
    const char* arg = argv[i];
    if (strncmp(arg, kNumThreadsPrefix, num_threads_len) == 0) {
      const char* text = arg + num_threads_len;
      char* end = nullptr;
      // strtol (unlike atoi) has defined behavior on overflow and reports
      // where parsing stopped, so trailing garbage can be rejected.
      const long parsed = strtol(text, &end, 10);
      const int threads = static_cast<int>(parsed);
      if (end == text || *end != '\0' || parsed <= 0 || threads != parsed) {
        std::cerr << "Invalid --num_threads: " << text << "\n";
        return 1;
      }
      FLAGS_num_threads = threads;
      remove_arg(i);
    } else if (strncmp(arg, kRuntimeFlagsPrefix, runtime_flags_len) == 0) {
      const char* text = arg + runtime_flags_len;
      // Requiring a leading digit rules out empty values, signs and leading
      // whitespace, all of which strtoul would otherwise accept.
      const bool starts_with_digit = text[0] >= '0' && text[0] <= '9';
      const bool is_hex =
          text[0] == '0' && (text[1] == 'x' || text[1] == 'X');
      char* end = nullptr;
      // With base 16, strtoul consumes the optional "0x"/"0X" prefix itself.
      const unsigned long parsed = strtoul(text, &end, is_hex ? 16 : 10);
      if (!starts_with_digit || *end != '\0' || parsed > UINT32_MAX) {
        std::cerr << "Invalid --xnn_runtime_flags: " << text << "\n";
        return 1;
      }
      FLAGS_xnn_runtime_flags = static_cast<uint32_t>(parsed);
      remove_arg(i);
    } else {
      ++i;
    }
  }
  return 0;
}

#ifdef BENCHMARK_ARGS_BOTTLENECK
// An externally supplied main() calls this hook so that the XNNPACK-specific
// flags get parsed and stripped from the argument list.
extern "C" int BenchmarkArgBottleneck(int& argc, char**& argv) {
  return ProcessArgs(argc, argv);
}
#else
// Stand-alone entry point: let Google Benchmark consume its own flags first,
// then parse the XNNPACK-specific ones, reject anything left over, and run
// the selected benchmarks.
int main(int argc, char** argv) {
  ::benchmark::Initialize(&argc, argv);

  const int rc = ProcessArgs(argc, argv);
  if (rc != 0) {
    return rc;
  }

  if (::benchmark::ReportUnrecognizedArguments(argc, argv)) {
    return 1;
  }

  ::benchmark::RunSpecifiedBenchmarks();
  return 0;
}
#endif

// Emit the benchmark entry point through the XNN_BENCHMARK_MAIN macro:
// either main() or, when BENCHMARK_ARGS_BOTTLENECK is defined, the
// BenchmarkArgBottleneck hook.
XNN_BENCHMARK_MAIN();
52 changes: 42 additions & 10 deletions bench/utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,33 +3,38 @@
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include "utils.h"

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <mutex>

#include "xnnpack/common.h"
#include <benchmark/benchmark.h>
#include "pthreadpool.h"

#ifdef __linux__
#include <sched.h>
#include <sched.h>
#endif
#if defined(__ANDROID__) || defined(_WIN32) || defined(__CYGWIN__)
#include <malloc.h>
#include <malloc.h>
#endif
#if defined(__SSE__) || defined(__x86_64__)
#include <xmmintrin.h>
#include <xmmintrin.h>
#endif

#if XNN_ENABLE_CPUINFO
#include <cpuinfo.h>
#include <cpuinfo.h>
#endif // XNN_ENABLE_CPUINFO

#include "xnnpack/common.h"
#include "xnnpack/hardware-config.h"
#include <benchmark/benchmark.h>
#include "pthreadpool.h"

#include "utils.h"
// Common flags for all benchmarks.
// FLAGS_num_threads: set from --num_threads=N by ProcessArgs, which rejects
// non-positive values. Defaults to 1.
int FLAGS_num_threads = 1;
// FLAGS_xnn_runtime_flags: set from --xnn_runtime_flags=V by ProcessArgs
// (decimal, or hexadecimal after a "0x" prefix). Defaults to 0.
uint32_t FLAGS_xnn_runtime_flags = 0;

namespace benchmark {
namespace utils {
Expand Down Expand Up @@ -88,6 +93,33 @@ void PthreadpoolClearL2Cache(void* context, size_t id) {

}; // namespace

// Parses the XNNPACK-specific command-line flags and strips them from the
// argument list.
//
// Recognized flags:
//   --num_threads=N        positive decimal integer, stored in
//                          FLAGS_num_threads.
//   --xnn_runtime_flags=V  unsigned 32-bit value, written in decimal or in
//                          hexadecimal with a "0x"/"0X" prefix, stored in
//                          FLAGS_xnn_runtime_flags.
//
// Each recognized flag is removed from `argv` and `argc` is decremented, so
// the remaining arguments keep their relative order. Unrecognized arguments
// are left untouched.
//
// Returns 0 on success. Returns 1, after printing a diagnostic to std::cerr,
// when a recognized flag carries a value that is empty, malformed, has
// trailing characters, or is out of range; the corresponding FLAGS_ variable
// is left unmodified in that case.
int ProcessArgs(int& argc, char**& argv) {
  static const char kNumThreadsPrefix[] = "--num_threads=";
  static const char kRuntimeFlagsPrefix[] = "--xnn_runtime_flags=";
  const size_t num_threads_len = sizeof(kNumThreadsPrefix) - 1;
  const size_t runtime_flags_len = sizeof(kRuntimeFlagsPrefix) - 1;

  // Removes argv[index] by shifting the following arguments down one slot.
  // The vacated last slot is cleared so that argv[argc] remains a null
  // pointer instead of a stale duplicate of the last argument.
  const auto remove_arg = [&argc, &argv](int index) {
    std::copy(argv + index + 1, argv + argc, argv + index);
    argc -= 1;
    argv[argc] = nullptr;
  };

  for (int i = 1; i < argc;) {
    const char* arg = argv[i];
    if (strncmp(arg, kNumThreadsPrefix, num_threads_len) == 0) {
      const char* text = arg + num_threads_len;
      char* end = nullptr;
      // strtol (unlike atoi) has defined behavior on overflow and reports
      // where parsing stopped, so trailing garbage can be rejected.
      const long parsed = strtol(text, &end, 10);
      const int threads = static_cast<int>(parsed);
      if (end == text || *end != '\0' || parsed <= 0 || threads != parsed) {
        std::cerr << "Invalid --num_threads: " << text << "\n";
        return 1;
      }
      FLAGS_num_threads = threads;
      remove_arg(i);
    } else if (strncmp(arg, kRuntimeFlagsPrefix, runtime_flags_len) == 0) {
      const char* text = arg + runtime_flags_len;
      // Requiring a leading digit rules out empty values, signs and leading
      // whitespace, all of which strtoul would otherwise accept.
      const bool starts_with_digit = text[0] >= '0' && text[0] <= '9';
      const bool is_hex =
          text[0] == '0' && (text[1] == 'x' || text[1] == 'X');
      char* end = nullptr;
      // With base 16, strtoul consumes the optional "0x"/"0X" prefix itself.
      const unsigned long parsed = strtoul(text, &end, is_hex ? 16 : 10);
      if (!starts_with_digit || *end != '\0' || parsed > UINT32_MAX) {
        std::cerr << "Invalid --xnn_runtime_flags: " << text << "\n";
        return 1;
      }
      FLAGS_xnn_runtime_flags = static_cast<uint32_t>(parsed);
      remove_arg(i);
    } else {
      ++i;
    }
  }
  // InitGoogle(...);
  return 0;
}

uint32_t PrefetchToL1(const void* ptr, size_t size) {
uint32_t step = 16;
#if XNN_ENABLE_CPUINFO
Expand Down Expand Up @@ -154,7 +186,7 @@ void DisableDenormals() {
#endif
}

// Return clockrate in Hz
// Return clock rate in Hz.
uint64_t GetCurrentCpuFrequency() {
#ifdef __linux__
int freq = 0;
Expand Down
25 changes: 25 additions & 0 deletions bench/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,34 @@
#include <benchmark/benchmark.h>
#include "pthreadpool.h"

// XNN_BENCHMARK_MAIN() emits the entry point of a benchmark binary. It is
// meant to be invoked once, at namespace scope, followed by a semicolon.
#ifdef BENCHMARK_ARGS_BOTTLENECK
// A main() is provided elsewhere; only define the C-linkage hook
// BenchmarkArgBottleneck, which forwards to benchmark::utils::ProcessArgs.
#define XNN_BENCHMARK_MAIN() \
extern "C" { \
int BenchmarkArgBottleneck(int& argc, char**& argv) { \
return benchmark::utils::ProcessArgs(argc, argv); \
} \
}
#else
// Define main(): initialize Google Benchmark, parse the XNNPACK-specific
// flags via benchmark::utils::ProcessArgs, fail on unrecognized arguments,
// then run the selected benchmarks. The trailing `int main(int, char**)`
// redeclaration lets the semicolon written after the macro invocation
// terminate a declaration.
#define XNN_BENCHMARK_MAIN() \
int main(int argc, char** argv) { \
::benchmark::Initialize(&argc, argv); \
int status = benchmark::utils::ProcessArgs(argc, argv); \
if (status != 0) return status; \
if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1; \
::benchmark::RunSpecifiedBenchmarks(); \
} \
int main(int, char**)
#endif // BENCHMARK_ARGS_BOTTLENECK

// Common flags for all benchmarks.
// Declarations only. Both flags are populated from the command line by
// benchmark::utils::ProcessArgs (--num_threads= and --xnn_runtime_flags=).
extern int FLAGS_num_threads;
extern uint32_t FLAGS_xnn_runtime_flags;

namespace benchmark {
namespace utils {

int ProcessArgs(int& argc, char**& argv);

uint32_t WipeCache();
uint32_t PrefetchToL1(const void* ptr, size_t size);

Expand Down

0 comments on commit 72e4d09

Please sign in to comment.