Skip to content

Commit

Permalink
HPCC-31457 Add UTF-8 specific regex support using PCRE2
Browse files Browse the repository at this point in the history
  • Loading branch information
dcamper committed Apr 16, 2024
1 parent d48ce3f commit a805e2b
Show file tree
Hide file tree
Showing 18 changed files with 2,094 additions and 1,002 deletions.
5 changes: 5 additions & 0 deletions common/deftype/deftype.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2135,6 +2135,11 @@ bool isPatternType(ITypeInfo * type)
}
}

// Returns true only when the supplied type's type code is type_utf8.
bool isUTF8Type(ITypeInfo * type)
{
    switch (type->getTypeCode())
    {
    case type_utf8:
        return true;
    default:
        return false;
    }
}

bool isUnicodeType(ITypeInfo * type)
{
switch(type->getTypeCode())
Expand Down
1 change: 1 addition & 0 deletions common/deftype/deftype.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,7 @@ extern DEFTYPE_API bool isSimpleStringType(ITypeInfo * type);
extern DEFTYPE_API bool isSimpleIntegralType(ITypeInfo * type);
extern DEFTYPE_API bool isIntegralType(ITypeInfo * type);
extern DEFTYPE_API bool isPatternType(ITypeInfo * type);
extern DEFTYPE_API bool isUTF8Type(ITypeInfo * type);
extern DEFTYPE_API bool isUnicodeType(ITypeInfo * type);
extern DEFTYPE_API bool isLittleEndian(ITypeInfo * type);
extern DEFTYPE_API bool isDatasetType(ITypeInfo * type);
Expand Down
53 changes: 50 additions & 3 deletions ecl/hql/hqlfold.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2700,7 +2700,31 @@ IHqlExpression * foldConstantOperator(IHqlExpression * expr, unsigned foldOption
if (t0 && t1 && (!c2 || t2))
{
IValue * result;
if(isUnicodeType(t0->queryType()))
if (isUTF8Type(t0->queryType()))
{
StringBuffer pattern, search;
t0->getUTF8Value(pattern);
t1->getUTF8Value(search);
ICompiledStrRegExpr * compiled = rtlCreateCompiledU8StrRegExpr(pattern, !expr->hasAttribute(noCaseAtom));
IStrRegExprFindInstance * match = compiled->find(search, 0, search.length(), false);
ITypeInfo * type = expr->queryType();
if(type->getTypeCode() == type_boolean)
{
result = createBoolValue(match->found());
}
else
{
assertex(c2 && t2);
size32_t len;
char * data;
match->getMatchX(len, data, (unsigned)t2->getIntValue());
result = type->castFrom(len, data);
rtlFree(data);
}
rtlDestroyU8StrRegExprFindInstance(match);
rtlDestroyCompiledU8StrRegExpr(compiled);
}
else if(isUnicodeType(t0->queryType()))
{
unsigned plen = t0->queryType()->getStringLen();
unsigned slen = t1->queryType()->getStringLen();
Expand Down Expand Up @@ -2767,7 +2791,16 @@ IHqlExpression * foldConstantOperator(IHqlExpression * expr, unsigned foldOption
size32_t resultBytes;
rtlDataAttr matchResults;

if(isUnicodeType(v0->queryType()))
if (isUTF8Type(v0->queryType()))
{
StringBuffer pattern, search;
v0->getUTF8Value(pattern);
v1->getUTF8Value(search);
ICompiledStrRegExpr * compiled = rtlCreateCompiledU8StrRegExpr(pattern, !expr->hasAttribute(noCaseAtom));
compiled->getMatchSet(isAllResult, resultBytes, matchResults.refdata(), search.length(), search.str());
rtlDestroyCompiledU8StrRegExpr(compiled);
}
else if(isUnicodeType(v0->queryType()))
{
size32_t plen = v0->queryType()->getStringLen();
OwnedMalloc<UChar> pattern (plen+1);
Expand Down Expand Up @@ -2800,7 +2833,21 @@ IHqlExpression * foldConstantOperator(IHqlExpression * expr, unsigned foldOption
if (t0 && t1 && t2)
{
IValue * result;
if(isUnicodeType(t0->queryType()))
if (isUTF8Type(t0->queryType()))
{
StringBuffer pattern, search, replace;
t0->getUTF8Value(pattern);
t1->getUTF8Value(search);
t2->getUTF8Value(replace);
size32_t outlen;
char * out;
ICompiledStrRegExpr * compiled = rtlCreateCompiledU8StrRegExpr(pattern, !expr->hasAttribute(noCaseAtom));
compiled->replace(outlen, out, search.length(), search.str(), replace.length(), replace.str());
result = createUtf8Value(outlen, out, makeUtf8Type(outlen, NULL));
rtlFree(out);
rtlDestroyCompiledU8StrRegExpr(compiled);
}
else if(isUnicodeType(t0->queryType()))
{
unsigned plen = t0->queryType()->getStringLen();
unsigned slen = t1->queryType()->getStringLen();
Expand Down
9 changes: 8 additions & 1 deletion ecl/hql/hqlutil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10709,7 +10709,14 @@ IException * checkRegexSyntax(IHqlExpression * expr)
{
try
{
if (isUnicodeType(expr->queryType()))
if (isUTF8Type(expr->queryType()))
{
Owned<ITypeInfo> unknownVarUTF8Type = makeUtf8Type(UNKNOWN_LENGTH, nullptr);
Owned<IValue> castValue = value->castTo(unknownVarUTF8Type);
ICompiledStrRegExpr * compiled = rtlCreateCompiledU8StrRegExpr((const char *)castValue->queryValue(), false);
rtlDestroyCompiledU8StrRegExpr(compiled);
}
else if (isUnicodeType(expr->queryType()))
{
Owned<ITypeInfo> unknownVarUnicodeType = makeVarUnicodeType(UNKNOWN_LENGTH, nullptr);
Owned<IValue> castValue = value->castTo(unknownVarUnicodeType);
Expand Down
12 changes: 12 additions & 0 deletions ecl/hqlcpp/hqlcatom.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -528,16 +528,22 @@ IIdAtom * regexFindXId;
IIdAtom * regexGetFindStrId;
IIdAtom * regexNewSetStrPatternId;
IIdAtom * regexNewSetUStrPatternId;
IIdAtom * regexNewSetU8StrPatternId;
IIdAtom * regexNewStrFindId;
IIdAtom * regexNewStrFoundId;
IIdAtom * regexNewStrFoundXId;
IIdAtom * regexNewStrReplaceXId;
IIdAtom * regexNewUStrFindId;
IIdAtom * regexNewU8StrFindId;
IIdAtom * regexNewUStrFoundId;
IIdAtom * regexNewU8StrFoundId;
IIdAtom * regexNewUStrFoundXId;
IIdAtom * regexNewU8StrFoundXId;
IIdAtom * regexNewUStrReplaceXId;
IIdAtom * regexNewU8StrReplaceXId;
IIdAtom * regexMatchSetId;
IIdAtom * regexUStrMatchSetId;
IIdAtom * regexU8StrMatchSetId;
IIdAtom * regexReplaceXId;
IIdAtom * registerTimerId;
IIdAtom * releaseRowId;
Expand Down Expand Up @@ -1207,16 +1213,22 @@ MODULE_INIT(INIT_PRIORITY_HQLATOM-1)
MAKEID(regexGetFindStr);
MAKEID(regexNewSetStrPattern);
MAKEID(regexNewSetUStrPattern);
MAKEID(regexNewSetU8StrPattern);
MAKEID(regexNewStrFind);
MAKEID(regexNewStrFound);
MAKEID(regexNewStrFoundX);
MAKEID(regexNewStrReplaceX);
MAKEID(regexNewUStrFind);
MAKEID(regexNewU8StrFind);
MAKEID(regexNewUStrFound);
MAKEID(regexNewU8StrFound);
MAKEID(regexNewUStrFoundX);
MAKEID(regexNewU8StrFoundX);
MAKEID(regexNewUStrReplaceX);
MAKEID(regexNewU8StrReplaceX);
MAKEID(regexMatchSet);
MAKEID(regexUStrMatchSet);
MAKEID(regexU8StrMatchSet);
MAKEID(regexReplaceX);
MAKEID(registerTimer);
MAKEID(releaseRow);
Expand Down
6 changes: 6 additions & 0 deletions ecl/hqlcpp/hqlcatom.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -526,16 +526,22 @@ extern IIdAtom * regexFindXId;
extern IIdAtom * regexGetFindStrId;
extern IIdAtom * regexNewSetStrPatternId;
extern IIdAtom * regexNewSetUStrPatternId;
extern IIdAtom * regexNewSetU8StrPatternId;
extern IIdAtom * regexNewStrFindId;
extern IIdAtom * regexNewStrFoundId;
extern IIdAtom * regexNewStrFoundXId;
extern IIdAtom * regexNewStrReplaceXId;
extern IIdAtom * regexNewUStrFindId;
extern IIdAtom * regexNewU8StrFindId;
extern IIdAtom * regexNewUStrFoundId;
extern IIdAtom * regexNewU8StrFoundId;
extern IIdAtom * regexNewUStrFoundXId;
extern IIdAtom * regexNewU8StrFoundXId;
extern IIdAtom * regexNewUStrReplaceXId;
extern IIdAtom * regexNewU8StrReplaceXId;
extern IIdAtom * regexMatchSetId;
extern IIdAtom * regexUStrMatchSetId;
extern IIdAtom * regexU8StrMatchSetId;
extern IIdAtom * regexReplaceXId;
extern IIdAtom * registerTimerId;
extern IIdAtom * releaseRowId;
Expand Down
2 changes: 1 addition & 1 deletion ecl/hqlcpp/hqlcpp.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -1739,7 +1739,7 @@ public:

void doBuildNewRegexFindReplace(BuildCtx & ctx, const CHqlBoundTarget * target, IHqlExpression * expr, CHqlBoundExpr * bound);

IHqlExpression * doBuildRegexCompileInstance(BuildCtx & ctx, IHqlExpression * pattern, bool unicode, bool caseSensitive);
IHqlExpression * doBuildRegexCompileInstance(BuildCtx & ctx, IHqlExpression * pattern, ITypeInfo * stringType, bool caseSensitive);
IHqlExpression * doBuildRegexFindInstance(BuildCtx & ctx, IHqlExpression * compiled, IHqlExpression * search, bool cloneSearch);

IHqlExpression * doCreateGraphLookup(BuildCtx & declarectx, BuildCtx & resolvectx, unique_id_t id, const char * activity, bool isChild);
Expand Down
7 changes: 7 additions & 0 deletions ecl/hqlcpp/hqlcppsys.ecl
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,13 @@ const char * cppSystemText[] = {
" unicode regexNewUStrReplaceX(const unicode _search, const unicode _replace) : method,pure,entrypoint='replace',time('REGEXREPLACE');"
" set of unicode regexUStrMatchSet(const unicode _search) : method,pure,entrypoint='getMatchSet',time('REGEXFINDSET');"

// UTF-8 regex entry points. The match and replace results are UTF-8 data, so
// regexNewU8StrFoundX and regexNewU8StrReplaceX are declared as returning utf8
// (not unicode), in line with the 'set of utf8' result of regexU8StrMatchSet.
" regexNewSetU8StrPattern(const utf8 _pattern, boolean isCaseSensitive) : omethod,entrypoint='setPattern',time('CompileUTF8Regex');"
" regexNewU8StrFind(boolean _compiled, const utf8 _search, boolean _cloneSearch) : omethod,entrypoint='find',time('REGEXFIND');"
" boolean regexNewU8StrFound() : method,pure,entrypoint='found';"
" utf8 regexNewU8StrFoundX(unsigned4 idx) : method,pure,entrypoint='getMatchX';"
" utf8 regexNewU8StrReplaceX(const utf8 _search, const utf8 _replace) : method,pure,entrypoint='replace',time('REGEXREPLACE');"
" set of utf8 regexU8StrMatchSet(const utf8 _search) : method,pure,entrypoint='getMatchSet',time('REGEXFINDSET');"

//clibrary functions that are called from the code generation
" free(noconst data1 src) : eclrtl,library='eclrtl',entrypoint='rtlFree';",
" integer4 memcmp(const data1 target, const data1 src, unsigned4 len) : sys,pure,entrypoint='memcmp';",
Expand Down
85 changes: 68 additions & 17 deletions ecl/hqlcpp/hqlhtcpp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18470,9 +18470,9 @@ ABoundActivity * HqlCppTranslator::doBuildActivityHTTP(BuildCtx & ctx, IHqlExpre

//---------------------------------------------------------------------------

IHqlExpression * HqlCppTranslator::doBuildRegexCompileInstance(BuildCtx & ctx, IHqlExpression * pattern, bool isUnicode, bool isCaseSensitive)
IHqlExpression * HqlCppTranslator::doBuildRegexCompileInstance(BuildCtx & ctx, IHqlExpression * pattern, ITypeInfo * stringType, bool isCaseSensitive)
{
OwnedHqlExpr searchKey = createAttribute(_regexInstance_Atom, LINK(pattern), createConstant(isUnicode), createConstant(isCaseSensitive));
OwnedHqlExpr searchKey = createAttribute(_regexInstance_Atom, LINK(pattern), createConstant(stringType->queryTypeName()), createConstant(isCaseSensitive));
HqlExprAssociation * match = ctx.queryMatchExpr(searchKey);
if (match)
return match->queryExpr();
Expand Down Expand Up @@ -18518,12 +18518,20 @@ IHqlExpression * HqlCppTranslator::doBuildRegexCompileInstance(BuildCtx & ctx, I

StringBuffer tempName;
getUniqueId(tempName.append("regex"));
ITypeInfo * type = makeClassType(isUnicode ? "rtlCompiledUStrRegex" : "rtlCompiledStrRegex");
ITypeInfo * type = nullptr;
if (isUTF8Type(stringType))
type = makeClassType("rtlCompiledU8StrRegex");
else if (isUnicodeType(stringType))
type = makeClassType("rtlCompiledUStrRegex");
else
type = makeClassType("rtlCompiledStrRegex");
OwnedHqlExpr regexInstance = createVariable(tempName.str(), type);
if (!initCtx)
{
OwnedITypeInfo patternType;
if (isUnicode)
if (isUTF8Type(stringType))
patternType.setown(makeUtf8Type(UNKNOWN_LENGTH, nullptr));
else if (isUnicodeType(stringType))
patternType.setown(makeVarUnicodeType(UNKNOWN_LENGTH, nullptr));
else
patternType.set(unknownVarStringType);
Expand Down Expand Up @@ -18551,7 +18559,13 @@ IHqlExpression * HqlCppTranslator::doBuildRegexCompileInstance(BuildCtx & ctx, I
args.append(*LINK(regexInstance));
args.append(*LINK(pattern));
args.append(*createConstant(isCaseSensitive));
IIdAtom * func = isUnicode ? regexNewSetUStrPatternId : regexNewSetStrPatternId;
IIdAtom * func = nullptr;
if (isUTF8Type(stringType))
func = regexNewSetU8StrPatternId;
else if (isUnicodeType(stringType))
func = regexNewSetUStrPatternId;
else
func = regexNewSetStrPatternId;
buildFunctionCall(*initCtx, func, args);
}
declareCtx->associateExpr(searchKey, regexInstance);
Expand All @@ -18566,10 +18580,16 @@ IHqlExpression * HqlCppTranslator::doBuildRegexFindInstance(BuildCtx & ctx, IHql
if (match)
return match->queryExpr();

bool isUnicode = isUnicodeType(search->queryType());
ITypeInfo * searchStringType = search->queryType();
StringBuffer tempName;
getUniqueId(tempName.append("fi"));
ITypeInfo * type = makeClassType(isUnicode ? "rtlUStrRegexFindInstance" : "rtlStrRegexFindInstance");
ITypeInfo * type = nullptr;
if (isUTF8Type(searchStringType))
type = makeClassType("rtlU8StrRegexFindInstance");
else if (isUnicodeType(searchStringType))
type = makeClassType("rtlUStrRegexFindInstance");
else
type = makeClassType("rtlStrRegexFindInstance");
OwnedHqlExpr regexInstance = createVariable(tempName.str(), type);
ctx.addDeclare(regexInstance);

Expand All @@ -18580,9 +18600,15 @@ IHqlExpression * HqlCppTranslator::doBuildRegexFindInstance(BuildCtx & ctx, IHql
args.append(*LINK(regexInstance));
args.append(*createTranslated(castCompiled));
args.append(*LINK(search));
if (!isUnicode)
// regexNewU8StrFind is declared with a trailing _cloneSearch argument, so the
// flag must be supplied for UTF-8 searches as well as for non-unicode ones.
// The UTF-8 test is made explicit because a UTF-8 type may also satisfy
// isUnicodeType(), which would otherwise leave the UTF-8 call one argument short.
if (isUTF8Type(searchStringType) || !isUnicodeType(searchStringType))
    args.append(*createConstant(cloneSearch));
// Pick the find entry point matching the search string's type; UTF-8 is
// tested first so it is not routed to the generic unicode variant.
IIdAtom * func = nullptr;
if (isUTF8Type(searchStringType))
    func = regexNewU8StrFindId;
else if (isUnicodeType(searchStringType))
    func = regexNewUStrFindId;
else
    func = regexNewStrFindId;
buildFunctionCall(ctx, func, args);
ctx.associateExpr(searchKey, regexInstance);

Expand All @@ -18603,8 +18629,8 @@ void HqlCppTranslator::doBuildNewRegexFindReplace(BuildCtx & ctx, const CHqlBoun

IHqlExpression * pattern = expr->queryChild(0);
IHqlExpression * search = expr->queryChild(1);
bool isUnicode = isUnicodeType(search->queryType());
IHqlExpression * compiled = doBuildRegexCompileInstance(ctx, pattern, isUnicode, !expr->hasAttribute(noCaseAtom));
ITypeInfo * searchStringType = search->queryType();
IHqlExpression * compiled = doBuildRegexCompileInstance(ctx, pattern, searchStringType, !expr->hasAttribute(noCaseAtom));

// Because the search instance is created locally, the search parameter is always going to be valid
// as long as the find instance. Only exception could be if call created a temporary class instance.
Expand All @@ -18614,7 +18640,13 @@ void HqlCppTranslator::doBuildNewRegexFindReplace(BuildCtx & ctx, const CHqlBoun
args.append(*LINK(compiled));
args.append(*LINK(search));
args.append(*LINK(expr->queryChild(2)));
IIdAtom * func = isUnicode ? regexNewUStrReplaceXId : regexNewStrReplaceXId;
IIdAtom * func = nullptr;
if (isUTF8Type(searchStringType))
func = regexNewU8StrReplaceXId;
else if (isUnicodeType(searchStringType))
func = regexNewUStrReplaceXId;
else
func = regexNewStrReplaceXId;
OwnedHqlExpr call = bindFunctionCall(func, args);
//Need to associate???
buildExprOrAssign(ctx, target, call, bound);
Expand All @@ -18627,7 +18659,13 @@ void HqlCppTranslator::doBuildNewRegexFindReplace(BuildCtx & ctx, const CHqlBoun
{
HqlExprArray args;
args.append(*LINK(findInstance));
IIdAtom * func= isUnicode ? regexNewUStrFoundId : regexNewStrFoundId;
IIdAtom * func = nullptr;
if (isUTF8Type(searchStringType))
func = regexNewU8StrFoundId;
else if (isUnicodeType(searchStringType))
func = regexNewUStrFoundId;
else
func = regexNewStrFoundId;
OwnedHqlExpr call = bindFunctionCall(func, args);
buildExprOrAssign(ctx, target, call, bound);
}
Expand All @@ -18636,7 +18674,13 @@ void HqlCppTranslator::doBuildNewRegexFindReplace(BuildCtx & ctx, const CHqlBoun
HqlExprArray args;
args.append(*LINK(findInstance));
args.append(*LINK(expr->queryChild(2)));
IIdAtom * func= isUnicode ? regexNewUStrFoundXId : regexNewStrFoundXId;
IIdAtom * func = nullptr;
if (isUTF8Type(searchStringType))
func = regexNewU8StrFoundXId;
else if (isUnicodeType(searchStringType))
func = regexNewUStrFoundXId;
else
func = regexNewStrFoundXId;
OwnedHqlExpr call = bindFunctionCall(func, args);
buildExprOrAssign(ctx, target, call, bound);
}
Expand Down Expand Up @@ -18665,13 +18709,20 @@ void HqlCppTranslator::doBuildExprRegexFindSet(BuildCtx & ctx, IHqlExpression *

IHqlExpression * pattern = expr->queryChild(0);
IHqlExpression * search = expr->queryChild(1);
bool isUnicode = isUnicodeType(search->queryType());
IHqlExpression * compiled = doBuildRegexCompileInstance(ctx, pattern, isUnicode, !expr->hasAttribute(noCaseAtom));
ITypeInfo * searchStringType = search->queryType();
IHqlExpression * compiled = doBuildRegexCompileInstance(ctx, pattern, searchStringType, !expr->hasAttribute(noCaseAtom));

HqlExprArray args;
args.append(*LINK(compiled));
args.append(*LINK(search));
IIdAtom * func = isUnicode ? regexUStrMatchSetId : regexMatchSetId;
IIdAtom * func = nullptr;
if (isUTF8Type(searchStringType))
func = regexU8StrMatchSetId;
else if (isUnicodeType(searchStringType))
func = regexUStrMatchSetId;
else
func = regexMatchSetId;

OwnedHqlExpr call = bindFunctionCall(func, args);
buildExprOrAssign(ctx, NULL, call, &bound);
//REGEXFINDSET() can never return ALL - so explicitly clear it in the result.
Expand Down
Loading

0 comments on commit a805e2b

Please sign in to comment.