Skip to content

Commit

Permalink
HPCC-32306 Optimize REGEXFIND(,n) to a fixed-sized target
Browse files Browse the repository at this point in the history
  • Loading branch information
dcamper committed Sep 18, 2024
1 parent fa22875 commit 00e985d
Show file tree
Hide file tree
Showing 9 changed files with 244 additions and 15 deletions.
6 changes: 6 additions & 0 deletions ecl/hqlcpp/hqlcatom.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -532,14 +532,17 @@ IIdAtom * regexNewSetU8StrPatternId;
// Identifier atoms for the regex system functions.
// NOTE(review): presumably each one is populated by the matching MAKEID(...)
// entry in MODULE_INIT -- confirm against the macro definition.
IIdAtom * regexNewStrFindId;
IIdAtom * regexNewStrFoundId;
IIdAtom * regexNewStrFoundXId;
IIdAtom * regexNewStrFoundXFixedId; // STRING variant that writes the match into a caller-supplied fixed-size target
IIdAtom * regexNewStrReplaceXId;
IIdAtom * regexNewStrReplaceFixedId;
IIdAtom * regexNewUStrFindId;
IIdAtom * regexNewU8StrFindId;
IIdAtom * regexNewUStrFoundId;
IIdAtom * regexNewU8StrFoundId;
IIdAtom * regexNewUStrFoundXId;
IIdAtom * regexNewUStrFoundXFixedId; // UNICODE variant that writes the match into a caller-supplied fixed-size target
IIdAtom * regexNewU8StrFoundXId;
IIdAtom * regexNewU8StrFoundXFixedId; // UTF8 variant that writes the match into a caller-supplied fixed-size target
IIdAtom * regexNewUStrReplaceXId;
IIdAtom * regexNewUStrReplaceFixedId;
IIdAtom * regexNewU8StrReplaceXId;
Expand Down Expand Up @@ -1220,14 +1223,17 @@ MODULE_INIT(INIT_PRIORITY_HQLATOM-1)
MAKEID(regexNewStrFind);
MAKEID(regexNewStrFound);
MAKEID(regexNewStrFoundX);
MAKEID(regexNewStrFoundXFixed);
MAKEID(regexNewStrReplaceX);
MAKEID(regexNewStrReplaceFixed);
MAKEID(regexNewUStrFind);
MAKEID(regexNewU8StrFind);
MAKEID(regexNewUStrFound);
MAKEID(regexNewU8StrFound);
MAKEID(regexNewUStrFoundX);
MAKEID(regexNewUStrFoundXFixed);
MAKEID(regexNewU8StrFoundX);
MAKEID(regexNewU8StrFoundXFixed);
MAKEID(regexNewUStrReplaceX);
MAKEID(regexNewUStrReplaceFixed);
MAKEID(regexNewU8StrReplaceX);
Expand Down
3 changes: 3 additions & 0 deletions ecl/hqlcpp/hqlcatom.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -530,14 +530,17 @@ extern IIdAtom * regexNewSetU8StrPatternId;
// Declarations of the identifier atoms for the regex system functions.
// The *FoundXFixedId atoms name the variants that take a caller-supplied
// fixed-size target buffer instead of returning a newly sized result.
extern IIdAtom * regexNewStrFindId;
extern IIdAtom * regexNewStrFoundId;
extern IIdAtom * regexNewStrFoundXId;
extern IIdAtom * regexNewStrFoundXFixedId;
extern IIdAtom * regexNewStrReplaceXId;
extern IIdAtom * regexNewStrReplaceFixedId;
extern IIdAtom * regexNewUStrFindId;
extern IIdAtom * regexNewU8StrFindId;
extern IIdAtom * regexNewUStrFoundId;
extern IIdAtom * regexNewU8StrFoundId;
extern IIdAtom * regexNewUStrFoundXId;
extern IIdAtom * regexNewUStrFoundXFixedId;
extern IIdAtom * regexNewU8StrFoundXId;
extern IIdAtom * regexNewU8StrFoundXFixedId;
extern IIdAtom * regexNewUStrReplaceXId;
extern IIdAtom * regexNewUStrReplaceFixedId;
extern IIdAtom * regexNewU8StrReplaceXId;
Expand Down
3 changes: 2 additions & 1 deletion ecl/hqlcpp/hqlcpp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6750,7 +6750,8 @@ void HqlCppTranslator::doBuildAssignCast(BuildCtx & ctx, const CHqlBoundTarget &
ignoreStretched = isStringType(targetType);
break;
case no_regex_replace:
// replacing into a fixed-sized target should not require a temp
case no_regex_find:
// Returning result into a fixed-sized target should not require a temp
useTemp = false;
break;
}
Expand Down
3 changes: 3 additions & 0 deletions ecl/hqlcpp/hqlcppsys.ecl
Original file line number Diff line number Diff line change
Expand Up @@ -557,6 +557,7 @@ const char * cppSystemText[] = {
" regexNewStrFind(boolean _compiled, const string _search, boolean _cloneSearch) : omethod,entrypoint='findTimed',timer('REGEXFIND');"
" boolean regexNewStrFound() : method,pure,entrypoint='found';"
" string regexNewStrFoundX(unsigned4 idx) : method,pure,entrypoint='getMatchX';"
" regexNewStrFoundXFixed(noconst string _tgt, unsigned4 idx) : method,pure,entrypoint='getMatchXFixed';"
" string regexNewStrReplaceX(const string _search, const string _replace) : method,pure,entrypoint='replaceTimed',timer('REGEXREPLACE');"
" regexNewStrReplaceFixed(noconst string _tgt, const string _search, const string _replace) : method,pure,entrypoint='replaceFixedTimed',timer('REGEXREPLACE');"
" set of string regexMatchSet(const string _search) : method,pure,entrypoint='getMatchSetTimed',timer('REGEXFINDSET');"
Expand All @@ -565,6 +566,7 @@ const char * cppSystemText[] = {
" regexNewUStrFind(boolean _compiled, const unicode _search) : omethod,entrypoint='findTimed',timer('REGEXFIND');"
" boolean regexNewUStrFound() : method,pure,entrypoint='found';"
" unicode regexNewUStrFoundX(unsigned4 idx) : method,pure,entrypoint='getMatchX';"
" regexNewUStrFoundXFixed(noconst unicode _tgt, unsigned4 idx) : method,pure,entrypoint='getMatchXFixed';"
" unicode regexNewUStrReplaceX(const unicode _search, const unicode _replace) : method,pure,entrypoint='replaceTimed',timer('REGEXREPLACE');"
" regexNewUStrReplaceFixed(noconst unicode _tgt, const unicode _search, const unicode _replace) : method,pure,entrypoint='replaceFixedTimed',timer('REGEXREPLACE');"
" set of unicode regexUStrMatchSet(const unicode _search) : method,pure,entrypoint='getMatchSetTimed',timer('REGEXFINDSET');"
Expand All @@ -573,6 +575,7 @@ const char * cppSystemText[] = {
" regexNewU8StrFind(boolean _compiled, const utf8 _search, boolean _cloneSearch) : omethod,entrypoint='findTimed',timer('REGEXFIND');"
" boolean regexNewU8StrFound() : method,pure,entrypoint='found';"
" utf8 regexNewU8StrFoundX(unsigned4 idx) : method,pure,entrypoint='getMatchX';"
" regexNewU8StrFoundXFixed(noconst utf8 _tgt, unsigned4 idx) : method,pure,entrypoint='getMatchXFixed';"
" utf8 regexNewU8StrReplaceX(const utf8 _search, const utf8 _replace) : method,pure,entrypoint='replaceTimed',timer('REGEXREPLACE');"
" regexNewU8StrReplaceFixed(noconst utf8 _tgt, const utf8 _search, const utf8 _replace) : method,pure,entrypoint='replaceFixedTimed',timer('REGEXREPLACE');"
" set of utf8 regexU8StrMatchSet(const utf8 _search) : method,pure,entrypoint='getMatchSetTimed',timer('REGEXFINDSET');"
Expand Down
47 changes: 36 additions & 11 deletions ecl/hqlcpp/hqlhtcpp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18728,18 +18728,43 @@ void HqlCppTranslator::doBuildNewRegexFindReplace(BuildCtx & ctx, const CHqlBoun
}
else
{
HqlExprArray args;
args.append(*LINK(findInstance));
args.append(*LINK(expr->queryChild(2)));
IIdAtom * func = nullptr;
if (isUTF8Type(searchStringType))
func = regexNewU8StrFoundXId;
else if (isUnicodeType(searchStringType))
func = regexNewUStrFoundXId;
if (target && target->isFixedSize() && target->queryType()->getTypeCode() == expr->queryType()->getTypeCode())
{
// We need to build our arguments manually because we need to
// pass the size of the output buffer (the target) as an argument
IHqlExpression * targetVar = target->expr;
unsigned targetSize = target->queryType()->getStringLen();

HqlExprArray args;
args.append(*LINK(findInstance)); // instance on which method is called
args.append(*getSizetConstant(targetSize)); // size of the output buffer in code units
args.append(*getElementPointer(targetVar)); // pointer to the output buffer
args.append(*LINK(expr->queryChild(2))); // capture group to find and return

IIdAtom * func = nullptr;
if (isUTF8Type(searchStringType))
func = regexNewU8StrFoundXFixedId;
else if (isUnicodeType(searchStringType))
func = regexNewUStrFoundXFixedId;
else
func = regexNewStrFoundXFixedId;
callProcedure(ctx, func, args);
}
else
func = regexNewStrFoundXId;
OwnedHqlExpr call = bindFunctionCall(func, args);
buildExprOrAssign(ctx, target, call, bound);
{
HqlExprArray args;
args.append(*LINK(findInstance));
args.append(*LINK(expr->queryChild(2)));
IIdAtom * func = nullptr;
if (isUTF8Type(searchStringType))
func = regexNewU8StrFoundXId;
else if (isUnicodeType(searchStringType))
func = regexNewUStrFoundXId;
else
func = regexNewStrFoundXId;
OwnedHqlExpr call = bindFunctionCall(func, args);
buildExprOrAssign(ctx, target, call, bound);
}
}
}
}
Expand Down
71 changes: 68 additions & 3 deletions rtl/eclrtl/eclregex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,7 @@ class CStrRegExprFindInstance final : implements IStrRegExprFindInstance
PCRE2MatchData8 matchData; // match data queried for the capture-group offsets
const char * subject = nullptr; // points to current subject of regex; do not free
char * sample = nullptr; //only required if findstr/findvstr will be called
bool isUTF8Enabled = false; // set by the constructor when the compiled regex has PCRE2_UTF enabled

public:
CStrRegExprFindInstance(std::shared_ptr<pcre2_code_8> _compiledRegex, const char * _subject, size32_t _from, size32_t _len, bool _keep)
Expand All @@ -304,10 +305,10 @@ class CStrRegExprFindInstance final : implements IStrRegExprFindInstance
// See if UTF-8 is enabled on this compiled regex
uint32_t option_bits;
pcre2_pattern_info_8(compiledRegex.get(), PCRE2_INFO_ALLOPTIONS, &option_bits);
bool utf8Enabled = (option_bits & PCRE2_UTF) != 0;
isUTF8Enabled = (option_bits & PCRE2_UTF) != 0;
// Make sure the offset and length is in code points (bytes), not characters
size32_t subjectOffset = (utf8Enabled ? rtlUtf8Size(_from, _subject) : _from);
size32_t subjectSize = (utf8Enabled ? rtlUtf8Size(_len, _subject) : _len);
size32_t subjectOffset = (isUTF8Enabled ? rtlUtf8Size(_from, _subject) : _from);
size32_t subjectSize = (isUTF8Enabled ? rtlUtf8Size(_len, _subject) : _len);

if (_keep)
{
Expand Down Expand Up @@ -363,6 +364,40 @@ class CStrRegExprFindInstance final : implements IStrRegExprFindInstance
}
}

// Copies capture group n of the current match into the caller-supplied
// fixed-size buffer.
//
// tlen - capacity of tgt; nothing is written when it is zero
// tgt  - destination buffer; when the match fits, or there is no match, all
//        tlen positions are written (match followed by space padding)
// n    - capture group to return (0 is the entire match)
//
// A match longer than tlen is truncated. No terminator is written.
void getMatchXFixed(size32_t tlen, char * tgt, unsigned n = 0) const
{
    if (tlen == 0)
        return;

    if (!matched || (n >= pcre2_get_ovector_count_8(matchData)))
    {
        // No match, or no such capture group: return an empty (blank) string
        memset_iflen(tgt, ' ', tlen);
        return;
    }

    PCRE2_SIZE * ovector = pcre2_get_ovector_pointer_8(matchData);
    const PCRE2_SIZE startOffset = ovector[2 * n];
    const size32_t foundSize = ovector[2 * n + 1] - startOffset;
    if (foundSize == 0)
    {
        // Either an empty match or a group that did not take part in the
        // match. In the latter case both offsets are PCRE2_UNSET, so the
        // start offset must not be turned into a pointer.
        memset_iflen(tgt, ' ', tlen);
        return;
    }

    const char * matchStart = subject + startOffset;
    if (foundSize <= tlen)
    {
        // Fits: copy the match and blank-fill the remainder of the buffer
        memcpy_iflen(tgt, matchStart, foundSize);
        memset_iflen(tgt + foundSize, ' ', tlen - foundSize);
    }
    else if (isUTF8Enabled)
    {
        // Too long for the buffer; delegate the truncation to the UTF-8 helper.
        // NOTE(review): foundSize is a byte count -- confirm that
        // rtlUtf8ToUtf8 expects its lengths in the same unit.
        rtlUtf8ToUtf8(tlen, tgt, foundSize, matchStart);
    }
    else
    {
        // Too long for the buffer: keep only the leading tlen bytes
        memcpy_iflen(tgt, matchStart, tlen);
    }
}

char const * findvstr(unsigned outlen, char * out, unsigned n)
{
if (matched && (n < pcre2_get_ovector_count_8(matchData)))
Expand Down Expand Up @@ -947,6 +982,36 @@ class CUStrRegExprFindInstance final : implements IUStrRegExprFindInstance
}
}

// Copies capture group n of the current match into the caller-supplied
// fixed-size UChar buffer.
//
// tlen - capacity of tgt in UChar elements; nothing is written when it is zero
// tgt  - destination buffer; all tlen elements are written (match followed by
//        space padding, or only spaces when there is nothing to return)
// n    - capture group to return (0 is the entire match)
//
// A match longer than tlen elements is truncated to its first tlen elements.
// No terminator is written.
void getMatchXFixed(size32_t tlen, UChar * tgt, unsigned n = 0) const
{
    if (tlen == 0)
        return;

    size32_t written = 0;
    if (matched && (n < pcre2_get_ovector_count_16(matchData)))
    {
        PCRE2_SIZE * ovector = pcre2_get_ovector_pointer_16(matchData);
        const PCRE2_SIZE startOffset = ovector[2 * n];
        const size32_t foundSize = ovector[2 * n + 1] - startOffset;
        // A zero size means an empty match or a group that did not take part
        // in the match. In the latter case both offsets are PCRE2_UNSET, so
        // the start offset must not be turned into a pointer.
        if (foundSize != 0)
        {
            written = (foundSize <= tlen) ? foundSize : tlen;
            memcpy_iflen(tgt, subject + startOffset, written * sizeof(UChar));
        }
    }

    // Blank-fill whatever the match did not cover (the whole buffer when
    // there was no match or no such capture group)
    while (written < tlen)
        tgt[written++] = ' ';
}

UChar const * findvstr(unsigned outlen, UChar * out, unsigned n)
{
if (matched && (n < pcre2_get_ovector_count_16(matchData)))
Expand Down
2 changes: 2 additions & 0 deletions rtl/eclrtl/eclrtl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ interface IStrRegExprFindInstance
{
virtual bool found() const = 0;
virtual void getMatchX(size32_t & outlen, char * & out, unsigned n = 0) const = 0;
virtual void getMatchXFixed(size32_t tlen, char * tgt, unsigned n = 0) const = 0;
};

interface ICompiledStrRegExpr
Expand All @@ -102,6 +103,7 @@ interface IUStrRegExprFindInstance
{
virtual bool found() const = 0;
virtual void getMatchX(size32_t & outlen, UChar * & out, unsigned n = 0) const = 0;
virtual void getMatchXFixed(size32_t tlen, UChar * tgt, unsigned n = 0) const = 0;
};

interface ICompiledUStrRegExpr
Expand Down
12 changes: 12 additions & 0 deletions testing/regress/ecl/key/regex_find_fixed.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
<Dataset name='first_word_0'>
<Row><unbounded_s>Colorless</unbounded_s><bounded_s>Colorless </bounded_s><unbounded_u>Colorless</unbounded_u><bounded_u>Colorless </bounded_u></Row>
</Dataset>
<Dataset name='two_words_1'>
<Row><unbounded_s>Colorless</unbounded_s><bounded_s>Colorless </bounded_s><unbounded_u>Colorless</unbounded_u><bounded_u>Colorless </bounded_u></Row>
</Dataset>
<Dataset name='two_words_2'>
<Row><unbounded_s>green</unbounded_s><bounded_s>green </bounded_s><unbounded_u>green</unbounded_u><bounded_u>green </bounded_u></Row>
</Dataset>
<Dataset name='two_words_3'>
<Row><unbounded_s></unbounded_s><bounded_s> </bounded_s><unbounded_u></unbounded_u><bounded_u> </bounded_u></Row>
</Dataset>
112 changes: 112 additions & 0 deletions testing/regress/ecl/regex_find_fixed.ecl
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
/*##############################################################################
HPCC SYSTEMS software Copyright (C) 2024 HPCC Systems®.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
############################################################################## */

// NOTE(review): presumably disabled so the REGEXFIND calls below are not
// constant-folded by the compiler -- confirm.
#OPTION('globalFold', FALSE);

//------------------------------------------

// Single-row input dataset; every test case below searches this sentence
inDS := DATASET(['Colorless green ideas sleep furiously.'], {STRING s});

// UTF-8 not included because the concept of a fixed-length
// UTF-8 string does not make sense

// unbounded_x: result is assigned to a variable-length field
// bounded_x: result is assigned to a fixed-length (10-character) field

// Result layout: each REGEXFIND result is stored in both a variable-length
// and a fixed-length field, for STRING and for UNICODE
ResLayout := RECORD
STRING unbounded_s; // variable-length STRING target
STRING10 bounded_s; // fixed-length STRING target
UNICODE unbounded_u; // variable-length UNICODE target
UNICODE10 bounded_u; // fixed-length UNICODE target
END;

//------------------------------------------

// Search for a word at beginning of string, return entire match (group 0).
// Expected: 'Colorless' in every field; the bounded fields are space-padded
// to 10 characters.
STRING first_word_0_ps := '^\\w+' : STORED('first_word_0_ps');
UNICODE first_word_0_pu := u'^\\p{L}+' : STORED('first_word_0_pu');

first_word_0 := PROJECT
(
NOFOLD(inDS),
TRANSFORM
(
ResLayout,
SELF.unbounded_s := NOFOLD(REGEXFIND(first_word_0_ps, (STRING)LEFT.s, 0)),
SELF.bounded_s := NOFOLD(REGEXFIND(first_word_0_ps, (STRING)LEFT.s, 0)),
SELF.unbounded_u := NOFOLD(REGEXFIND(first_word_0_pu, (UNICODE)LEFT.s, 0)),
SELF.bounded_u := NOFOLD(REGEXFIND(first_word_0_pu, (UNICODE)LEFT.s, 0))
)
);
OUTPUT(first_word_0, NAMED('first_word_0'));

//------------------------------------------

// Search for two words at beginning of string, return only first word
// (capture group 1).
// Expected: 'Colorless' in every field; the bounded fields are space-padded
// to 10 characters.
STRING two_words_1_ps := '^(\\w+) (\\w+)' : STORED('two_words_1_ps');
UNICODE two_words_1_pu := u'^(\\p{L}+) (\\p{L}+)' : STORED('two_words_1_pu');

two_words_1 := PROJECT
(
NOFOLD(inDS),
TRANSFORM
(
ResLayout,
SELF.unbounded_s := NOFOLD(REGEXFIND(two_words_1_ps, (STRING)LEFT.s, 1)),
SELF.bounded_s := NOFOLD(REGEXFIND(two_words_1_ps, (STRING)LEFT.s, 1)),
SELF.unbounded_u := NOFOLD(REGEXFIND(two_words_1_pu, (UNICODE)LEFT.s, 1)),
SELF.bounded_u := NOFOLD(REGEXFIND(two_words_1_pu, (UNICODE)LEFT.s, 1))
)
);
OUTPUT(two_words_1, NAMED('two_words_1'));

//------------------------------------------

// Search for two words at beginning of string, return only second word
// (capture group 2).
// Expected: 'green' in every field; the bounded fields are space-padded
// to 10 characters.
STRING two_words_2_ps := '^(\\w+) (\\w+)' : STORED('two_words_2_ps');
UNICODE two_words_2_pu := u'^(\\p{L}+) (\\p{L}+)' : STORED('two_words_2_pu');

two_words_2 := PROJECT
(
NOFOLD(inDS),
TRANSFORM
(
ResLayout,
SELF.unbounded_s := NOFOLD(REGEXFIND(two_words_2_ps, (STRING)LEFT.s, 2)),
SELF.bounded_s := NOFOLD(REGEXFIND(two_words_2_ps, (STRING)LEFT.s, 2)),
SELF.unbounded_u := NOFOLD(REGEXFIND(two_words_2_pu, (UNICODE)LEFT.s, 2)),
SELF.bounded_u := NOFOLD(REGEXFIND(two_words_2_pu, (UNICODE)LEFT.s, 2))
)
);
OUTPUT(two_words_2, NAMED('two_words_2'));

//------------------------------------------

// Search for two words at beginning of string, return only third word (which does not exist).
// The pattern has only two capture groups, so group 3 yields an empty result:
// the unbounded fields are empty and the bounded fields are all spaces.
STRING two_words_3_ps := '^(\\w+) (\\w+)' : STORED('two_words_3_ps');
UNICODE two_words_3_pu := u'^(\\p{L}+) (\\p{L}+)' : STORED('two_words_3_pu');

two_words_3 := PROJECT
(
NOFOLD(inDS),
TRANSFORM
(
ResLayout,
SELF.unbounded_s := NOFOLD(REGEXFIND(two_words_3_ps, (STRING)LEFT.s, 3)),
SELF.bounded_s := NOFOLD(REGEXFIND(two_words_3_ps, (STRING)LEFT.s, 3)),
SELF.unbounded_u := NOFOLD(REGEXFIND(two_words_3_pu, (UNICODE)LEFT.s, 3)),
SELF.bounded_u := NOFOLD(REGEXFIND(two_words_3_pu, (UNICODE)LEFT.s, 3))
)
);
OUTPUT(two_words_3, NAMED('two_words_3'));

0 comments on commit 00e985d

Please sign in to comment.