diff --git a/.gitmodules b/.gitmodules index b23d6ab..585f6e8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "c-plus-plus/lib/regex-boost"] path = c-plus-plus/lib/regex-boost url = https://github.com/boostorg/regex.git +[submodule "c-plus-plus/lib/regex-hyperscan"] + path = c-plus-plus/lib/regex-hyperscan + url = https://github.com/intel/hyperscan diff --git a/Dockerfile b/Dockerfile index b76b7ce..b0e5415 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,6 +10,7 @@ RUN apt update \ curl \ wget \ libc6 \ + ragel \ unzip \ gnulib \ gnupg2 \ @@ -24,6 +25,7 @@ RUN apt update \ build-essential \ ca-certificates \ libreadline-dev \ + libboost-all-dev \ apt-transport-https \ software-properties-common diff --git a/c-plus-plus/Makefile b/c-plus-plus/Makefile index 7d1b3a8..4924029 100644 --- a/c-plus-plus/Makefile +++ b/c-plus-plus/Makefile @@ -1,5 +1,5 @@ CXXFLAGS += -Wall -fPIC -std=c++17 -O3 -INCLUDES += -Ilib/regex-boost/include +INCLUDES += -Ilib/regex-boost/include -Ilib/regex-hyperscan/src -Ilib/regex-hyperscan/bin -Ilib/regex-hyperscan/bin/lib/libhs.a OBJECTS += LDFLAGS += -lstdc++ -lpthread @@ -26,10 +26,12 @@ all: libraries $(BINARIES) libraries: cd ./lib/regex-boost && mkdir -p bin && cd bin && cmake -DCMAKE_POSITION_INDEPENDENT_CODE=ON .. && cmake --build .; + cd ./lib/regex-hyperscan && mkdir -p bin && cd bin && cmake -DCMAKE_POSITION_INDEPENDENT_CODE=ON .. && cmake --build .; clean: rm -f $(BINARIES) cd ./lib/regex-boost && rm -rf bin; + cd ./lib/regex-hyperscan && rm -rf bin; list: @echo $(BINARIES) diff --git a/c-plus-plus/src/regexp/hyperscan.cpp b/c-plus-plus/src/regexp/hyperscan.cpp new file mode 100644 index 0000000..684eb2a --- /dev/null +++ b/c-plus-plus/src/regexp/hyperscan.cpp @@ -0,0 +1,95 @@ +#include +#include +#include +#include +#include +#include +#include "hs.h" +#include "database.h" +#include "scratch.h" + +const size_t N_REGEX_ITERATIONS = 1000; +const size_t N_CHUNKS_ITERATIONS = 10; + +const std::vector rules = { + std::string("[Hh]ello [Ww]orld[!]?"), + std::string("[Hh]ello [Ww]orld[!]?$"), + std::string("[0-9][0-9][0-9][0-9][0-9]"), + std::string("[0-9][0-9][0-9][0-9][0-9]$"), + std::string("[CV]V[a-z]askjdvc[a-z0-9A-Z]"), + std::string("^[0-9][0-9]\\.[0-9][0-9]\\.[0-9][0-9]\\.[0-9][0-9]\\s[a-zA-Z]"), + std::string("^z.*$"), + std::string("^z.z.z.z.z.z.z.z.z.z.z.z.z.z*$"), + std::string("^[z][z][z][z][z][z][z][z][z][z][z][z][z][z][z]\\S*$"), + std::string("^[az]\\S\\D*$"), +}; + +std::vector> regexps; + +const std::vector chunks = { + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "10.10.10.10 cjdhjhsdclkjhasl dflaskijd flkzsjd vlkszJ cvlsJKHD CVlaskjdvcl sdjvl ksDJv lkj lkj lzkjsdf lkj lzskdjv lkzj 66678", + "hello world !", + "bar!234ahem.. 'hello world !' ..c !hello worldzzz !zzzzzzzzzzzzzzzzzzz ksajf 874r hbsdfk i7r kjasdhf ikasuwhfia7234 kwaejhfkawehf7234h zzzzzzzzzzzzzzzzzzzzzzzzz dsssssssssssssssssssssssssssssssssssssssss aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ssssssssssssssssssssssssssssssssssssssss ddddddddddddddddddddddddddddddddddddddddddddddddd ffffffffffffffffffffffffffffffffffffffffffff ggggggggggggggggggggggggggggggggggggggggggggggggggg hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh jjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjj kkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkk llllllllllllllllllllllllllllllllllllllllllllllllllll qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr ttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttt yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy uuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuu iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo pppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppiauwdhfl,asdnfliehjrsgoirtjoersahjrouh waei7yi7q34f kwejfg hello world", + "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", +}; + +int main(int argc, char *argv[]) +{ + regexps.resize(rules.size()); + + for (size_t i = 0; i < rules.size(); i++) + { + hs_compile_error_t *compile_err; + hs_database_t *db; + hs_scratch_t *scratch; + + hs_error_t err = hs_compile(rules[i].c_str(), HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH, HS_MODE_BLOCK, NULL, &db, &compile_err); + if (err != HS_SUCCESS) + { + std::cerr << "ERROR: Unable to compile pattern \"" << rules[i] << "\": " << compile_err->message << std::endl; + hs_free_compile_error(compile_err); + return -1; + } + + err = hs_alloc_scratch(db, &scratch); + if (err != HS_SUCCESS) + { + std::cerr << "ERROR: Unable to allocate scratch space. Exiting." << std::endl; + hs_free_database(db); + return -1; + } + + regexps[i] = std::make_tuple(db, scratch); + } + + const auto start_time = std::chrono::high_resolution_clock::now(); + + for (size_t i = 0; i < N_REGEX_ITERATIONS; i++) + { + for (const auto &re : regexps) + { + for (size_t j = 0; j < N_CHUNKS_ITERATIONS; j++) + { + for (auto chunk : chunks) + { + hs_error_t err = hs_scan(static_cast(std::get<0>(re)), chunk.c_str(), chunk.size(), 0, static_cast(std::get<1>(re)), [](unsigned int id, unsigned long long from, unsigned long long to, unsigned int flags, void *ctx) -> int { + return 0; + }, nullptr); + if (err != HS_SUCCESS) + { + std::cerr << "ERROR: Unable to scan chunk. Exiting." << std::endl; + return -1; + } + } + } + } + } + + const auto end_time = std::chrono::high_resolution_clock::now(); + const auto duration = std::chrono::duration_cast(end_time - start_time).count(); + + std::cout << "Execution time: " << duration << "ms" << std::endl; + + return 0; +} \ No newline at end of file