Skip to content

Commit

Permalink
feat(C++/regexp): add hyperscan bench
Browse files Browse the repository at this point in the history
  • Loading branch information
leon0399 committed Jun 20, 2024
1 parent b1a81bd commit 80c7361
Show file tree
Hide file tree
Showing 4 changed files with 103 additions and 1 deletion.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
[submodule "c-plus-plus/lib/regex-boost"]
path = c-plus-plus/lib/regex-boost
url = https://github.com/boostorg/regex.git
[submodule "c-plus-plus/lib/regex-hyperscan"]
path = c-plus-plus/lib/regex-hyperscan
url = https://github.com/intel/hyperscan
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ RUN apt update \
curl \
wget \
libc6 \
ragel \
unzip \
gnulib \
gnupg2 \
Expand All @@ -24,6 +25,7 @@ RUN apt update \
build-essential \
ca-certificates \
libreadline-dev \
libboost-all-dev \
apt-transport-https \
software-properties-common

Expand Down
4 changes: 3 additions & 1 deletion c-plus-plus/Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
CXXFLAGS += -Wall -fPIC -std=c++17 -O3
INCLUDES += -Ilib/regex-boost/include
INCLUDES += -Ilib/regex-boost/include -Ilib/regex-hyperscan/src -Ilib/regex-hyperscan/bin -Ilib/regex-hyperscan/bin/lib/libhs.a
OBJECTS +=
LDFLAGS += -lstdc++ -lpthread

Expand All @@ -26,10 +26,12 @@ all: libraries $(BINARIES)

libraries:
cd ./lib/regex-boost && mkdir -p bin && cd bin && cmake -DCMAKE_POSITION_INDEPENDENT_CODE=ON .. && cmake --build .;
cd ./lib/regex-hyperscan && mkdir -p bin && cd bin && cmake -DCMAKE_POSITION_INDEPENDENT_CODE=ON .. && cmake --build .;

clean:
rm -f $(BINARIES)
cd ./lib/regex-boost && rm -rf bin;
cd ./lib/regex-hyperscan && rm -rf bin;

list:
@echo $(BINARIES)
Expand Down
95 changes: 95 additions & 0 deletions c-plus-plus/src/regexp/hyperscan.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#include <chrono>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <string>
#include <tuple>
#include <vector>
#include "hs.h"
#include "database.h"
#include "scratch.h"

// Number of outer passes made over the complete set of compiled rules.
constexpr size_t N_REGEX_ITERATIONS{1000};
// Number of times each rule scans the whole chunk list within one outer pass.
constexpr size_t N_CHUNKS_ITERATIONS{10};

// Patterns under test; each one is compiled into its own database by main().
// Raw string literals are used where the pattern contains backslashes so the
// regex can be read without C++ escape noise.
const std::vector<std::string> rules{
    "[Hh]ello [Ww]orld[!]?",
    "[Hh]ello [Ww]orld[!]?$",
    "[0-9][0-9][0-9][0-9][0-9]",
    "[0-9][0-9][0-9][0-9][0-9]$",
    "[CV]V[a-z]askjdvc[a-z0-9A-Z]",
    R"(^[0-9][0-9]\.[0-9][0-9]\.[0-9][0-9]\.[0-9][0-9]\s[a-zA-Z])",
    "^z.*$",
    "^z.z.z.z.z.z.z.z.z.z.z.z.z.z*$",
    R"(^[z][z][z][z][z][z][z][z][z][z][z][z][z][z][z]\S*$)",
    R"(^[az]\S\D*$)",
};

std::vector<std::tuple<hs_database_t*, hs_scratch_t*>> regexps;

const std::vector<std::string> chunks = {
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
"10.10.10.10 cjdhjhsdclkjhasl dflaskijd flkzsjd vlkszJ cvlsJKHD CVlaskjdvcl sdjvl ksDJv lkj lkj lzkjsdf lkj lzskdjv lkzj 66678",
"hello world !",
"bar!234ahem.. 'hello world !' ..c !hello worldzzz !zzzzzzzzzzzzzzzzzzz ksajf 874r hbsdfk i7r kjasdhf ikasuwhfia7234 kwaejhfkawehf7234h zzzzzzzzzzzzzzzzzzzzzzzzz dsssssssssssssssssssssssssssssssssssssssss aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ssssssssssssssssssssssssssssssssssssssss ddddddddddddddddddddddddddddddddddddddddddddddddd ffffffffffffffffffffffffffffffffffffffffffff ggggggggggggggggggggggggggggggggggggggggggggggggggg hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh jjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjj kkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkk llllllllllllllllllllllllllllllllllllllllllllllllllll qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr ttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttt yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy uuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuu iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo pppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppiauwdhfl,asdnfliehjrsgoirtjoersahjrouh waei7yi7q34f kwejfg hello world",
"zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz",
};

int main(int argc, char *argv[])
{
regexps.resize(rules.size());

for (size_t i = 0; i < rules.size(); i++)
{
hs_compile_error_t *compile_err;
hs_database_t *db;
hs_scratch_t *scratch;

hs_error_t err = hs_compile(rules[i].c_str(), HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH, HS_MODE_BLOCK, NULL, &db, &compile_err);
if (err != HS_SUCCESS)
{
std::cerr << "ERROR: Unable to compile pattern \"" << rules[i] << "\": " << compile_err->message << std::endl;
hs_free_compile_error(compile_err);
return -1;
}

err = hs_alloc_scratch(db, &scratch);
if (err != HS_SUCCESS)
{
std::cerr << "ERROR: Unable to allocate scratch space. Exiting." << std::endl;
hs_free_database(db);
return -1;
}

regexps[i] = std::make_tuple(db, scratch);
}

const auto start_time = std::chrono::high_resolution_clock::now();

for (size_t i = 0; i < N_REGEX_ITERATIONS; i++)
{
for (const auto &re : regexps)
{
for (size_t j = 0; j < N_CHUNKS_ITERATIONS; j++)
{
for (auto chunk : chunks)
{
hs_error_t err = hs_scan(static_cast<hs_database_t*>(std::get<0>(re)), chunk.c_str(), chunk.size(), 0, static_cast<hs_scratch_t*>(std::get<1>(re)), [](unsigned int id, unsigned long long from, unsigned long long to, unsigned int flags, void *ctx) -> int {
return 0;
}, nullptr);
if (err != HS_SUCCESS)
{
std::cerr << "ERROR: Unable to scan chunk. Exiting." << std::endl;
return -1;
}
}
}
}
}

const auto end_time = std::chrono::high_resolution_clock::now();
const auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count();

std::cout << "Execution time: " << duration << "ms" << std::endl;

return 0;
}

0 comments on commit 80c7361

Please sign in to comment.