From 00e985dfcc15cca066d3ba00c1713d61a4f79bf7 Mon Sep 17 00:00:00 2001
From: "Dan S. Camper"
Date: Tue, 17 Sep 2024 15:35:20 -0500
Subject: [PATCH] HPCC-32306 Optimize REGEXFIND(,n) to a fixed-sized target

Add getMatchXFixed() to the string and unicode regex find instances. It
copies capture group n into a caller-supplied buffer of tlen code units,
space-padding a shorter match and truncating a longer one; a failed match or
an out-of-range group yields all spaces.

The code generator calls the new regexNew*FoundXFixed entry points when the
assignment target is fixed size and has the same type code as the REGEXFIND
expression, so the result no longer needs a temporary. All other cases keep
using getMatchX(). A regression test covering STRING10 and UNICODE10 targets
is included.

---
 ecl/hqlcpp/hqlcatom.cpp                      |   6 +
 ecl/hqlcpp/hqlcatom.hpp                      |   3 +
 ecl/hqlcpp/hqlcpp.cpp                        |   3 +-
 ecl/hqlcpp/hqlcppsys.ecl                     |   3 +
 ecl/hqlcpp/hqlhtcpp.cpp                      |  47 ++++++--
 rtl/eclrtl/eclregex.cpp                      |  71 +++++++++++-
 rtl/eclrtl/eclrtl.hpp                        |   2 +
 testing/regress/ecl/key/regex_find_fixed.xml |  12 ++
 testing/regress/ecl/regex_find_fixed.ecl     | 112 +++++++++++++++++++
 9 files changed, 244 insertions(+), 15 deletions(-)
 create mode 100644 testing/regress/ecl/key/regex_find_fixed.xml
 create mode 100644 testing/regress/ecl/regex_find_fixed.ecl

diff --git a/ecl/hqlcpp/hqlcatom.cpp b/ecl/hqlcpp/hqlcatom.cpp
index aa91c5b96d4..b4451d7d477 100644
--- a/ecl/hqlcpp/hqlcatom.cpp
+++ b/ecl/hqlcpp/hqlcatom.cpp
@@ -532,6 +532,7 @@ IIdAtom * regexNewSetU8StrPatternId;
 IIdAtom * regexNewStrFindId;
 IIdAtom * regexNewStrFoundId;
 IIdAtom * regexNewStrFoundXId;
+IIdAtom * regexNewStrFoundXFixedId;
 IIdAtom * regexNewStrReplaceXId;
 IIdAtom * regexNewStrReplaceFixedId;
 IIdAtom * regexNewUStrFindId;
@@ -539,7 +540,9 @@ IIdAtom * regexNewU8StrFindId;
 IIdAtom * regexNewUStrFoundId;
 IIdAtom * regexNewU8StrFoundId;
 IIdAtom * regexNewUStrFoundXId;
+IIdAtom * regexNewUStrFoundXFixedId;
 IIdAtom * regexNewU8StrFoundXId;
+IIdAtom * regexNewU8StrFoundXFixedId;
 IIdAtom * regexNewUStrReplaceXId;
 IIdAtom * regexNewUStrReplaceFixedId;
 IIdAtom * regexNewU8StrReplaceXId;
@@ -1220,6 +1223,7 @@ MODULE_INIT(INIT_PRIORITY_HQLATOM-1)
     MAKEID(regexNewStrFind);
     MAKEID(regexNewStrFound);
     MAKEID(regexNewStrFoundX);
+    MAKEID(regexNewStrFoundXFixed);
     MAKEID(regexNewStrReplaceX);
     MAKEID(regexNewStrReplaceFixed);
     MAKEID(regexNewUStrFind);
@@ -1227,7 +1231,9 @@ MODULE_INIT(INIT_PRIORITY_HQLATOM-1)
     MAKEID(regexNewUStrFound);
     MAKEID(regexNewU8StrFound);
     MAKEID(regexNewUStrFoundX);
+    MAKEID(regexNewUStrFoundXFixed);
     MAKEID(regexNewU8StrFoundX);
+    MAKEID(regexNewU8StrFoundXFixed);
     MAKEID(regexNewUStrReplaceX);
     MAKEID(regexNewUStrReplaceFixed);
     MAKEID(regexNewU8StrReplaceX);
diff --git a/ecl/hqlcpp/hqlcatom.hpp b/ecl/hqlcpp/hqlcatom.hpp
index bf9e5c130c8..ed6e84f0fcd 100644
--- a/ecl/hqlcpp/hqlcatom.hpp
+++ b/ecl/hqlcpp/hqlcatom.hpp
@@ -530,6 +530,7 @@ extern IIdAtom * regexNewSetU8StrPatternId;
 extern IIdAtom * regexNewStrFindId;
 extern IIdAtom * regexNewStrFoundId;
 extern IIdAtom * regexNewStrFoundXId;
+extern IIdAtom * regexNewStrFoundXFixedId;
 extern IIdAtom * regexNewStrReplaceXId;
 extern IIdAtom * regexNewStrReplaceFixedId;
 extern IIdAtom * regexNewUStrFindId;
@@ -537,7 +538,9 @@ extern IIdAtom * regexNewU8StrFindId;
 extern IIdAtom * regexNewUStrFoundId;
 extern IIdAtom * regexNewU8StrFoundId;
 extern IIdAtom * regexNewUStrFoundXId;
+extern IIdAtom * regexNewUStrFoundXFixedId;
 extern IIdAtom * regexNewU8StrFoundXId;
+extern IIdAtom * regexNewU8StrFoundXFixedId;
 extern IIdAtom * regexNewUStrReplaceXId;
 extern IIdAtom * regexNewUStrReplaceFixedId;
 extern IIdAtom * regexNewU8StrReplaceXId;
diff --git a/ecl/hqlcpp/hqlcpp.cpp b/ecl/hqlcpp/hqlcpp.cpp
index 9dc01e7e5da..b1d9c8122bb 100644
--- a/ecl/hqlcpp/hqlcpp.cpp
+++ b/ecl/hqlcpp/hqlcpp.cpp
@@ -6750,7 +6750,8 @@ void HqlCppTranslator::doBuildAssignCast(BuildCtx & ctx, const CHqlBoundTarget &
                     ignoreStretched = isStringType(targetType);
                     break;
                 case no_regex_replace:
-                    // replacing into a fixed-sized target should not require a temp
+                case no_regex_find:
+                    // Returning result into a fixed-sized target should not require a temp
                     useTemp = false;
                     break;
                 }
diff --git a/ecl/hqlcpp/hqlcppsys.ecl b/ecl/hqlcpp/hqlcppsys.ecl
index 8938288bec1..ca413910c08 100644
--- a/ecl/hqlcpp/hqlcppsys.ecl
+++ b/ecl/hqlcpp/hqlcppsys.ecl
@@ -557,6 +557,7 @@ const char * cppSystemText[] = {
     " regexNewStrFind(boolean _compiled, const string _search, boolean _cloneSearch) : omethod,entrypoint='findTimed',timer('REGEXFIND');"
     " boolean regexNewStrFound() : method,pure,entrypoint='found';"
     " string regexNewStrFoundX(unsigned4 idx) : method,pure,entrypoint='getMatchX';"
+    " regexNewStrFoundXFixed(noconst string _tgt, unsigned4 idx) : method,pure,entrypoint='getMatchXFixed';"
     " string regexNewStrReplaceX(const string _search, const string _replace) : method,pure,entrypoint='replaceTimed',timer('REGEXREPLACE');"
     " regexNewStrReplaceFixed(noconst string _tgt, const string _search, const string _replace) : method,pure,entrypoint='replaceFixedTimed',timer('REGEXREPLACE');"
     " set of string regexMatchSet(const string _search) : method,pure,entrypoint='getMatchSetTimed',timer('REGEXFINDSET');"
@@ -565,6 +566,7 @@ const char * cppSystemText[] = {
     " regexNewUStrFind(boolean _compiled, const unicode _search) : omethod,entrypoint='findTimed',timer('REGEXFIND');"
     " boolean regexNewUStrFound() : method,pure,entrypoint='found';"
     " unicode regexNewUStrFoundX(unsigned4 idx) : method,pure,entrypoint='getMatchX';"
+    " regexNewUStrFoundXFixed(noconst unicode _tgt, unsigned4 idx) : method,pure,entrypoint='getMatchXFixed';"
     " unicode regexNewUStrReplaceX(const unicode _search, const unicode _replace) : method,pure,entrypoint='replaceTimed',timer('REGEXREPLACE');"
     " regexNewUStrReplaceFixed(noconst unicode _tgt, const unicode _search, const unicode _replace) : method,pure,entrypoint='replaceFixedTimed',timer('REGEXREPLACE');"
     " set of unicode regexUStrMatchSet(const unicode _search) : method,pure,entrypoint='getMatchSetTimed',timer('REGEXFINDSET');"
@@ -573,6 +575,7 @@ const char * cppSystemText[] = {
     " regexNewU8StrFind(boolean _compiled, const utf8 _search, boolean _cloneSearch) : omethod,entrypoint='findTimed',timer('REGEXFIND');"
     " boolean regexNewU8StrFound() : method,pure,entrypoint='found';"
     " utf8 regexNewU8StrFoundX(unsigned4 idx) : method,pure,entrypoint='getMatchX';"
+    " regexNewU8StrFoundXFixed(noconst utf8 _tgt, unsigned4 idx) : method,pure,entrypoint='getMatchXFixed';"
     " utf8 regexNewU8StrReplaceX(const utf8 _search, const utf8 _replace) : method,pure,entrypoint='replaceTimed',timer('REGEXREPLACE');"
     " regexNewU8StrReplaceFixed(noconst utf8 _tgt, const utf8 _search, const utf8 _replace) : method,pure,entrypoint='replaceFixedTimed',timer('REGEXREPLACE');"
     " set of utf8 regexU8StrMatchSet(const utf8 _search) : method,pure,entrypoint='getMatchSetTimed',timer('REGEXFINDSET');"
diff --git a/ecl/hqlcpp/hqlhtcpp.cpp b/ecl/hqlcpp/hqlhtcpp.cpp
index 4904276a649..dd99da45897 100644
--- a/ecl/hqlcpp/hqlhtcpp.cpp
+++ b/ecl/hqlcpp/hqlhtcpp.cpp
@@ -18728,18 +18728,43 @@ void HqlCppTranslator::doBuildNewRegexFindReplace(BuildCtx & ctx, const CHqlBoun
         }
         else
         {
-            HqlExprArray args;
-            args.append(*LINK(findInstance));
-            args.append(*LINK(expr->queryChild(2)));
-            IIdAtom * func = nullptr;
-            if (isUTF8Type(searchStringType))
-                func = regexNewU8StrFoundXId;
-            else if (isUnicodeType(searchStringType))
-                func = regexNewUStrFoundXId;
+            if (target && target->isFixedSize() && target->queryType()->getTypeCode() == expr->queryType()->getTypeCode())
+            {
+                // We need to build our arguments manually because we need to
+                // pass the size of the output buffer (the target) as an argument
+                IHqlExpression * targetVar = target->expr;
+                unsigned targetSize = target->queryType()->getStringLen();
+
+                HqlExprArray args;
+                args.append(*LINK(findInstance)); // instance on which method is called
+                args.append(*getSizetConstant(targetSize)); // size of the output buffer in code units
+                args.append(*getElementPointer(targetVar)); // pointer to the output buffer
+                args.append(*LINK(expr->queryChild(2))); // capture group to find and return
+
+                IIdAtom * func = nullptr;
+                if (isUTF8Type(searchStringType))
+                    func = regexNewU8StrFoundXFixedId;
+                else if (isUnicodeType(searchStringType))
+                    func = regexNewUStrFoundXFixedId;
+                else
+                    func = regexNewStrFoundXFixedId;
+                callProcedure(ctx, func, args);
+            }
             else
-                func = regexNewStrFoundXId;
-            OwnedHqlExpr call = bindFunctionCall(func, args);
-            buildExprOrAssign(ctx, target, call, bound);
+            {
+                HqlExprArray args;
+                args.append(*LINK(findInstance));
+                args.append(*LINK(expr->queryChild(2)));
+                IIdAtom * func = nullptr;
+                if (isUTF8Type(searchStringType))
+                    func = regexNewU8StrFoundXId;
+                else if (isUnicodeType(searchStringType))
+                    func = regexNewUStrFoundXId;
+                else
+                    func = regexNewStrFoundXId;
+                OwnedHqlExpr call = bindFunctionCall(func, args);
+                buildExprOrAssign(ctx, target, call, bound);
+            }
         }
     }
 }
diff --git a/rtl/eclrtl/eclregex.cpp b/rtl/eclrtl/eclregex.cpp
index 4b098264642..5cc605c069f 100644
--- a/rtl/eclrtl/eclregex.cpp
+++ b/rtl/eclrtl/eclregex.cpp
@@ -296,6 +296,7 @@ class CStrRegExprFindInstance final : implements IStrRegExprFindInstance
     PCRE2MatchData8 matchData;
     const char * subject = nullptr; // points to current subject of regex; do not free
     char * sample = nullptr; //only required if findstr/findvstr will be called
+    bool isUTF8Enabled = false;
 
 public:
     CStrRegExprFindInstance(std::shared_ptr _compiledRegex, const char * _subject, size32_t _from, size32_t _len, bool _keep)
@@ -304,10 +305,10 @@ class CStrRegExprFindInstance final : implements IStrRegExprFindInstance
         // See if UTF-8 is enabled on this compiled regex
         uint32_t option_bits;
         pcre2_pattern_info_8(compiledRegex.get(), PCRE2_INFO_ALLOPTIONS, &option_bits);
-        bool utf8Enabled = (option_bits & PCRE2_UTF) != 0;
+        isUTF8Enabled = (option_bits & PCRE2_UTF) != 0;
         // Make sure the offset and length is in code points (bytes), not characters
-        size32_t subjectOffset = (utf8Enabled ? rtlUtf8Size(_from, _subject) : _from);
-        size32_t subjectSize = (utf8Enabled ? rtlUtf8Size(_len, _subject) : _len);
+        size32_t subjectOffset = (isUTF8Enabled ? rtlUtf8Size(_from, _subject) : _from);
+        size32_t subjectSize = (isUTF8Enabled ? rtlUtf8Size(_len, _subject) : _len);
 
         if (_keep)
         {
@@ -363,6 +364,40 @@ class CStrRegExprFindInstance final : implements IStrRegExprFindInstance
         }
     }
 
+    void getMatchXFixed(size32_t tlen, char * tgt, unsigned n = 0) const
+    {
+        if (tlen == 0)
+            return;
+
+        if (matched && (n < pcre2_get_ovector_count_8(matchData)))
+        {
+            PCRE2_SIZE * ovector = pcre2_get_ovector_pointer_8(matchData);
+            const char * matchStart = subject + ovector[2 * n];
+            size32_t foundSize = ovector[2 * n + 1] - ovector[2 * n];
+            if (foundSize <= tlen)
+            {
+                memcpy_iflen(tgt, matchStart, foundSize);
+                memset_iflen(tgt + foundSize, ' ', tlen - foundSize);
+            }
+            else
+            {
+                if (isUTF8Enabled)
+                {
+                    rtlUtf8ToUtf8(tlen, tgt, foundSize, matchStart);
+                }
+                else
+                {
+                    memcpy_iflen(tgt, matchStart, tlen);
+                }
+            }
+        }
+        else
+        {
+            // Return an empty string
+            memset_iflen(tgt, ' ', tlen);
+        }
+    }
+
     char const * findvstr(unsigned outlen, char * out, unsigned n)
     {
         if (matched && (n < pcre2_get_ovector_count_8(matchData)))
@@ -947,6 +982,36 @@ class CUStrRegExprFindInstance final : implements IUStrRegExprFindInstance
         }
     }
 
+    void getMatchXFixed(size32_t tlen, UChar * tgt, unsigned n = 0) const
+    {
+        if (tlen == 0)
+            return;
+
+        if (matched && (n < pcre2_get_ovector_count_16(matchData)))
+        {
+            PCRE2_SIZE * ovector = pcre2_get_ovector_pointer_16(matchData);
+            const UChar * matchStart = subject + ovector[2 * n];
+            size32_t foundSize = ovector[2 * n + 1] - ovector[2 * n];
+            if (foundSize <= tlen)
+            {
+                memcpy_iflen(tgt, matchStart, foundSize * sizeof(UChar));
+                while (foundSize < tlen)
+                    tgt[foundSize++] = ' ';
+            }
+            else
+            {
+                memcpy_iflen(tgt, matchStart, tlen * sizeof(UChar));
+            }
+        }
+        else
+        {
+            // Return an empty string
+            size32_t pos = 0;
+            while (pos < tlen)
+                tgt[pos++] = ' ';
+        }
+    }
+
     UChar const * findvstr(unsigned outlen, UChar * out, unsigned n)
     {
         if (matched && (n < pcre2_get_ovector_count_16(matchData)))
diff --git a/rtl/eclrtl/eclrtl.hpp b/rtl/eclrtl/eclrtl.hpp
index 9635ae23287..8d39bc32880 100644
--- a/rtl/eclrtl/eclrtl.hpp
+++ b/rtl/eclrtl/eclrtl.hpp
@@ -83,6 +83,7 @@ interface IStrRegExprFindInstance
 {
     virtual bool found() const = 0;
     virtual void getMatchX(size32_t & outlen, char * & out, unsigned n = 0) const = 0;
+    virtual void getMatchXFixed(size32_t tlen, char * tgt, unsigned n = 0) const = 0;
 };
 
 interface ICompiledStrRegExpr
@@ -102,6 +103,7 @@ interface IUStrRegExprFindInstance
 {
     virtual bool found() const = 0;
     virtual void getMatchX(size32_t & outlen, UChar * & out, unsigned n = 0) const = 0;
+    virtual void getMatchXFixed(size32_t tlen, UChar * tgt, unsigned n = 0) const = 0;
 };
 
 interface ICompiledUStrRegExpr
diff --git a/testing/regress/ecl/key/regex_find_fixed.xml b/testing/regress/ecl/key/regex_find_fixed.xml
new file mode 100644
index 00000000000..4c80bcd0c60
--- /dev/null
+++ b/testing/regress/ecl/key/regex_find_fixed.xml
@@ -0,0 +1,12 @@
+
+ ColorlessColorless ColorlessColorless
+
+
+ ColorlessColorless ColorlessColorless
+
+
+ greengreen greengreen
+
+
+
+
diff --git a/testing/regress/ecl/regex_find_fixed.ecl b/testing/regress/ecl/regex_find_fixed.ecl
new file mode 100644
index 00000000000..31b0f4cc961
--- /dev/null
+++ b/testing/regress/ecl/regex_find_fixed.ecl
@@ -0,0 +1,112 @@
+/*##############################################################################
+
+    HPCC SYSTEMS software Copyright (C) 2024 HPCC Systems®.
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+############################################################################## */
+
+#OPTION('globalFold', FALSE);
+
+//------------------------------------------
+
+inDS := DATASET(['Colorless green ideas sleep furiously.'], {STRING s});
+
+// UTF-8 not included because the concept of a fixed-length
+// UTF-8 string does not make sense
+
+// unbounded_x: result is assigned to a variable-length field
+// bounded_x: result is assigned to a fixed-length (10 character) field
+
+ResLayout := RECORD
+    STRING unbounded_s;
+    STRING10 bounded_s;
+    UNICODE unbounded_u;
+    UNICODE10 bounded_u;
+END;
+
+//------------------------------------------
+
+// Search for a word at beginning of string, return entire match
+STRING first_word_0_ps := '^\\w+' : STORED('first_word_0_ps');
+UNICODE first_word_0_pu := u'^\\p{L}+' : STORED('first_word_0_pu');
+
+first_word_0 := PROJECT
+    (
+        NOFOLD(inDS),
+        TRANSFORM
+            (
+                ResLayout,
+                SELF.unbounded_s := NOFOLD(REGEXFIND(first_word_0_ps, (STRING)LEFT.s, 0)),
+                SELF.bounded_s := NOFOLD(REGEXFIND(first_word_0_ps, (STRING)LEFT.s, 0)),
+                SELF.unbounded_u := NOFOLD(REGEXFIND(first_word_0_pu, (UNICODE)LEFT.s, 0)),
+                SELF.bounded_u := NOFOLD(REGEXFIND(first_word_0_pu, (UNICODE)LEFT.s, 0))
+            )
+    );
+OUTPUT(first_word_0, NAMED('first_word_0'));
+
+//------------------------------------------
+
+// Search for two words at beginning of string, return only first word
+STRING two_words_1_ps := '^(\\w+) (\\w+)' : STORED('two_words_1_ps');
+UNICODE two_words_1_pu := u'^(\\p{L}+) (\\p{L}+)' : STORED('two_words_1_pu');
+
+two_words_1 := PROJECT
+    (
+        NOFOLD(inDS),
+        TRANSFORM
+            (
+                ResLayout,
+                SELF.unbounded_s := NOFOLD(REGEXFIND(two_words_1_ps, (STRING)LEFT.s, 1)),
+                SELF.bounded_s := NOFOLD(REGEXFIND(two_words_1_ps, (STRING)LEFT.s, 1)),
+                SELF.unbounded_u := NOFOLD(REGEXFIND(two_words_1_pu, (UNICODE)LEFT.s, 1)),
+                SELF.bounded_u := NOFOLD(REGEXFIND(two_words_1_pu, (UNICODE)LEFT.s, 1))
+            )
+    );
+OUTPUT(two_words_1, NAMED('two_words_1'));
+
+//------------------------------------------
+
+// Search for two words at beginning of string, return only second word
+STRING two_words_2_ps := '^(\\w+) (\\w+)' : STORED('two_words_2_ps');
+UNICODE two_words_2_pu := u'^(\\p{L}+) (\\p{L}+)' : STORED('two_words_2_pu');
+
+two_words_2 := PROJECT
+    (
+        NOFOLD(inDS),
+        TRANSFORM
+            (
+                ResLayout,
+                SELF.unbounded_s := NOFOLD(REGEXFIND(two_words_2_ps, (STRING)LEFT.s, 2)),
+                SELF.bounded_s := NOFOLD(REGEXFIND(two_words_2_ps, (STRING)LEFT.s, 2)),
+                SELF.unbounded_u := NOFOLD(REGEXFIND(two_words_2_pu, (UNICODE)LEFT.s, 2)),
+                SELF.bounded_u := NOFOLD(REGEXFIND(two_words_2_pu, (UNICODE)LEFT.s, 2))
+            )
+    );
+OUTPUT(two_words_2, NAMED('two_words_2'));
+
+//------------------------------------------
+
+// Search for two words at beginning of string, return the third capture group (which does not exist)
+STRING two_words_3_ps := '^(\\w+) (\\w+)' : STORED('two_words_3_ps');
+UNICODE two_words_3_pu := u'^(\\p{L}+) (\\p{L}+)' : STORED('two_words_3_pu');
+
+two_words_3 := PROJECT
+    (
+        NOFOLD(inDS),
+        TRANSFORM
+            (
+                ResLayout,
+                SELF.unbounded_s := NOFOLD(REGEXFIND(two_words_3_ps, (STRING)LEFT.s, 3)),
+                SELF.bounded_s := NOFOLD(REGEXFIND(two_words_3_ps, (STRING)LEFT.s, 3)),
+                SELF.unbounded_u := NOFOLD(REGEXFIND(two_words_3_pu, (UNICODE)LEFT.s, 3)),
+                SELF.bounded_u := NOFOLD(REGEXFIND(two_words_3_pu, (UNICODE)LEFT.s, 3))
+            )
+    );
+OUTPUT(two_words_3, NAMED('two_words_3'));