Skip to content

Commit

Permalink
HPCC-31457 Add UTF-8 specific regex support using PCRE2
Browse files Browse the repository at this point in the history
  • Loading branch information
dcamper committed May 2, 2024
1 parent a703773 commit 1a033e1
Show file tree
Hide file tree
Showing 27 changed files with 2,375 additions and 1,039 deletions.
5 changes: 5 additions & 0 deletions common/deftype/deftype.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2135,6 +2135,11 @@ bool isPatternType(ITypeInfo * type)
}
}

// Reports whether the supplied type is a UTF-8 type, i.e. its type code is
// exactly type_utf8. The pointer is dereferenced unconditionally, so callers
// must pass a non-null type.
bool isUTF8Type(ITypeInfo * type)
{
    const bool hasUtf8Code = (type_utf8 == type->getTypeCode());
    return hasUtf8Code;
}

bool isUnicodeType(ITypeInfo * type)
{
switch(type->getTypeCode())
Expand Down
1 change: 1 addition & 0 deletions common/deftype/deftype.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,7 @@ extern DEFTYPE_API bool isSimpleStringType(ITypeInfo * type);
extern DEFTYPE_API bool isSimpleIntegralType(ITypeInfo * type);
extern DEFTYPE_API bool isIntegralType(ITypeInfo * type);
extern DEFTYPE_API bool isPatternType(ITypeInfo * type);
extern DEFTYPE_API bool isUTF8Type(ITypeInfo * type);
extern DEFTYPE_API bool isUnicodeType(ITypeInfo * type);
extern DEFTYPE_API bool isLittleEndian(ITypeInfo * type);
extern DEFTYPE_API bool isDatasetType(ITypeInfo * type);
Expand Down
53 changes: 50 additions & 3 deletions ecl/hql/hqlfold.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2700,7 +2700,31 @@ IHqlExpression * foldConstantOperator(IHqlExpression * expr, unsigned foldOption
if (t0 && t1 && (!c2 || t2))
{
IValue * result;
if(isUnicodeType(t0->queryType()))
// Constant-fold a regex find whose pattern operand (t0) is UTF-8.
// Both the pattern and the search text are extracted as UTF-8 before matching.
if (isUTF8Type(t0->queryType()))
{
StringBuffer pattern, search;
t0->getUTF8Value(pattern);
t1->getUTF8Value(search);
// Second argument is true unless the expression carries the noCase attribute.
ICompiledStrRegExpr * compiled = rtlCreateCompiledU8StrRegExpr(pattern, !expr->hasAttribute(noCaseAtom));
// The length passed is search.lengthUtf8() rather than the byte length
// (presumably a character count - confirm against the runtime interface).
IStrRegExprFindInstance * match = compiled->find(search, 0, search.lengthUtf8(), false);
ITypeInfo * type = expr->queryType();
if(type->getTypeCode() == type_boolean)
{
// Boolean-typed expression: the folded value is simply whether a match was found.
result = createBoolValue(match->found());
}
else
{
// Non-boolean expression: the third operand must be a constant; its integer
// value is passed to getMatchX to select which match text to return.
assertex(c2 && t2);
size32_t len;
char * data;
match->getMatchX(len, data, (unsigned)t2->getIntValue());
result = type->castFrom(len, data);
// data was returned by getMatchX and is released here once it has been cast.
rtlFree(data);
}
// Release the find instance and the compiled expression explicitly.
rtlDestroyU8StrRegExprFindInstance(match);
rtlDestroyCompiledU8StrRegExpr(compiled);
}
else if(isUnicodeType(t0->queryType()))
{
unsigned plen = t0->queryType()->getStringLen();
unsigned slen = t1->queryType()->getStringLen();
Expand Down Expand Up @@ -2767,7 +2791,16 @@ IHqlExpression * foldConstantOperator(IHqlExpression * expr, unsigned foldOption
size32_t resultBytes;
rtlDataAttr matchResults;

if(isUnicodeType(v0->queryType()))
// Constant-fold a regex match-set whose pattern operand (v0) is UTF-8.
if (isUTF8Type(v0->queryType()))
{
StringBuffer pattern, search;
v0->getUTF8Value(pattern);
v1->getUTF8Value(search);
// Second argument is true unless the expression carries the noCase attribute.
ICompiledStrRegExpr * compiled = rtlCreateCompiledU8StrRegExpr(pattern, !expr->hasAttribute(noCaseAtom));
// Fills isAllResult / resultBytes / matchResults; the search length passed is
// search.lengthUtf8() rather than the byte length.
compiled->getMatchSet(isAllResult, resultBytes, matchResults.refdata(), search.lengthUtf8(), search.str());
rtlDestroyCompiledU8StrRegExpr(compiled);
}
else if(isUnicodeType(v0->queryType()))
{
size32_t plen = v0->queryType()->getStringLen();
OwnedMalloc<UChar> pattern (plen+1);
Expand Down Expand Up @@ -2800,7 +2833,21 @@ IHqlExpression * foldConstantOperator(IHqlExpression * expr, unsigned foldOption
if (t0 && t1 && t2)
{
IValue * result;
if(isUnicodeType(t0->queryType()))
// Constant-fold a regex replace whose pattern operand (t0) is UTF-8.
// Pattern, search text and replacement text are all extracted as UTF-8, the
// pattern is compiled (case sensitive unless the expression carries the
// noCase attribute), and the replaced text becomes a UTF-8 constant.
if (isUTF8Type(t0->queryType()))
{
    StringBuffer pattern, search, replace;
    t0->getUTF8Value(pattern);
    t1->getUTF8Value(search);
    t2->getUTF8Value(replace);
    size32_t outlen;
    char * out;
    ICompiledStrRegExpr * compiled = rtlCreateCompiledU8StrRegExpr(pattern, !expr->hasAttribute(noCaseAtom));
    // Pass lengthUtf8() for both texts, as the UTF-8 find and match-set folding
    // paths in this function do. length() is the byte count, which is larger
    // than the UTF-8 length whenever the text contains multi-byte characters.
    compiled->replace(outlen, out, search.lengthUtf8(), search.str(), replace.lengthUtf8(), replace.str());
    result = createUtf8Value(outlen, out, makeUtf8Type(outlen, NULL));
    // out was returned by replace and is released once the value has been created.
    rtlFree(out);
    rtlDestroyCompiledU8StrRegExpr(compiled);
}
else if(isUnicodeType(t0->queryType()))
{
unsigned plen = t0->queryType()->getStringLen();
unsigned slen = t1->queryType()->getStringLen();
Expand Down
1 change: 1 addition & 0 deletions ecl/hql/hqlgram.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -555,6 +555,7 @@ class HqlGram : implements IErrorReceiver, public CInterface
void ensureString(attribute &a);
void ensureTypeCanBeIndexed(attribute &a);
void ensureUnicode(attribute &a);
void ensureUTF8(attribute &a);
void ensureData(attribute &a);
void ensureTransformTypeMatch(attribute & tattr, IHqlExpression * ds);
bool checkTransformTypeMatch(const attribute & errpos, IHqlExpression * ds, IHqlExpression * transform);
Expand Down
63 changes: 52 additions & 11 deletions ecl/hql/hqlgram.y
Original file line number Diff line number Diff line change
Expand Up @@ -6440,26 +6440,47 @@ primexpr1
}
| REGEXFIND '(' expression ',' expression regexOpt ')'
{
parser->normalizeExpression($3, type_stringorunicode, false);
parser->checkRegex($3);
if(isUnicodeType($3.queryExprType()))
if(isUTF8Type($3.queryExprType()))
{
parser->normalizeExpression($3, type_utf8, false);
parser->checkRegex($3);
parser->normalizeExpression($5, type_utf8, false);
}
else if(isUnicodeType($3.queryExprType()))
{
parser->normalizeExpression($3, type_unicode, false);
parser->checkRegex($3);
parser->normalizeExpression($5, type_unicode, false);
}
else
{
parser->normalizeExpression($3, type_string, false);
parser->checkRegex($3);
parser->normalizeExpression($5, type_string, false);
}
$$.setExpr(createValue(no_regex_find, makeBoolType(), $3.getExpr(), $5.getExpr(), $6.getExpr()));
}
| REGEXFIND '(' expression ',' expression ',' expression regexOpt ')'
{
parser->normalizeExpression($3, type_stringorunicode, false);
parser->checkRegex($3);
Owned<ITypeInfo> subType;
if(isUnicodeType($3.queryExprType()))
if(isUTF8Type($3.queryExprType()))
{
parser->normalizeExpression($3, type_utf8, false);
parser->checkRegex($3);
parser->normalizeExpression($5, type_utf8, false);
subType.setown(makeUtf8Type(UNKNOWN_LENGTH, 0));
}
else if(isUnicodeType($3.queryExprType()))
{
parser->normalizeExpression($3, type_unicode, false);
parser->checkRegex($3);
parser->normalizeExpression($5, type_unicode, false);
subType.setown(makeUnicodeType(UNKNOWN_LENGTH, 0));
}
else
{
parser->normalizeExpression($3, type_string, false);
parser->checkRegex($3);
parser->normalizeExpression($5, type_string, false);
subType.setown(makeStringType(UNKNOWN_LENGTH));
}
Expand All @@ -6468,16 +6489,25 @@ primexpr1
}
| REGEXFINDSET '(' expression ',' expression regexOpt ')'
{
parser->normalizeExpression($3, type_stringorunicode, false);
parser->checkRegex($3);
Owned<ITypeInfo> retType;
if(isUnicodeType($3.queryExprType()))
if(isUTF8Type($3.queryExprType()))
{
parser->normalizeExpression($3, type_utf8, false);
parser->checkRegex($3);
parser->normalizeExpression($5, type_utf8, false);
retType.setown(makeUtf8Type(UNKNOWN_LENGTH, $3.queryExprType()->queryLocale()));
}
else if(isUnicodeType($3.queryExprType()))
{
parser->normalizeExpression($3, type_unicode, false);
parser->checkRegex($3);
parser->normalizeExpression($5, type_unicode, false);
retType.setown(makeUnicodeType(UNKNOWN_LENGTH, $3.queryExprType()->queryLocale()));
}
else
{
parser->normalizeExpression($3, type_string, false);
parser->checkRegex($3);
parser->normalizeExpression($5, type_string, false);
retType.setown(makeStringType(UNKNOWN_LENGTH));
}
Expand All @@ -6486,16 +6516,27 @@ primexpr1
}
| REGEXREPLACE '(' expression ',' expression ',' expression regexOpt ')'
{
parser->normalizeExpression($3, type_stringorunicode, false);
Owned<ITypeInfo> retType;
if(isUnicodeType($3.queryExprType()))
if(isUTF8Type($3.queryExprType()))
{
parser->normalizeExpression($3, type_utf8, false);
parser->checkRegex($3);
parser->normalizeExpression($5, type_utf8, false);
parser->normalizeExpression($7, type_utf8, false);
retType.setown(makeUtf8Type(UNKNOWN_LENGTH, 0));
}
else if(isUnicodeType($3.queryExprType()))
{
parser->normalizeExpression($3, type_unicode, false);
parser->checkRegex($3);
parser->normalizeExpression($5, type_unicode, false);
parser->normalizeExpression($7, type_unicode, false);
retType.setown(makeUnicodeType(UNKNOWN_LENGTH, 0));
}
else
{
parser->normalizeExpression($3, type_string, false);
parser->checkRegex($3);
parser->normalizeExpression($5, type_string, false);
parser->normalizeExpression($7, type_string, false);
retType.setown(makeStringType(UNKNOWN_LENGTH));
Expand Down
22 changes: 22 additions & 0 deletions ecl/hql/hqlgram2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4703,6 +4703,9 @@ void HqlGram::normalizeExpression(attribute & exprAttr, type_t expectedType, boo
case type_unicode:
ensureUnicode(exprAttr);
break;
case type_utf8:
ensureUTF8(exprAttr);
break;
default:
throwUnexpected();
}
Expand Down Expand Up @@ -4974,6 +4977,25 @@ void HqlGram::ensureUnicode(attribute &a)
}
}

// Coerces the expression held by the attribute to UTF-8 where that is possible.
//  - No type, or already UTF-8: the attribute is left untouched.
//  - String or unicode type: the expression is replaced by one converted to
//    an unknown-length UTF-8 type.
//  - Anything else: a type-incompatibility error is reported against the
//    attribute and the expression is left as it was.
void HqlGram::ensureUTF8(attribute &a)
{
    ITypeInfo * exprType = a.queryExprType();
    if (!exprType || isUTF8Type(exprType))
        return;

    if (!isStringType(exprType) && !isUnicodeType(exprType))
    {
        StringBuffer typeName;
        reportError(ERR_TYPE_INCOMPATIBLE, a, "Incompatible types: expected UTF8, given %s", getFriendlyTypeStr(exprType, typeName).str());
        return;
    }

    Owned<ITypeInfo> targetType = makeUtf8Type(UNKNOWN_LENGTH, NULL);
    OwnedHqlExpr original = a.getExpr();
    a.setExpr(ensureExprType(original, targetType));
}

void HqlGram::ensureData(attribute &a)
{
ITypeInfo *t1 = a.queryExprType();
Expand Down
16 changes: 15 additions & 1 deletion ecl/hql/hqlutil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10212,6 +10212,15 @@ IHqlExpression * convertSetToExpression(bool isAll, size32_t len, const void * p
presult += numUChars;
};
break;
// UTF-8 elements: walk the result buffer, turning each entry into a constant.
case type_utf8:
while (presult < presult_end)
{
// Each entry starts with a size32_t length prefix, followed by the UTF-8 data.
const size32_t numUChars = *((size32_t *) presult);
presult += sizeof(size32_t);
results.append(*createConstant(createUtf8Value((unsigned)numUChars, (const char*)presult, makeUtf8Type(numUChars, NULL))));
// Advance by the size rtlUtf8Size reports for numUChars of this data
// (presumably the byte size of that many characters - confirm).
presult += rtlUtf8Size(numUChars, presult);
};
break;
default:
UNIMPLEMENTED;
}
Expand Down Expand Up @@ -10709,7 +10718,12 @@ IException * checkRegexSyntax(IHqlExpression * expr)
{
try
{
if (isUnicodeType(expr->queryType()))
// UTF-8 pattern: compile it and immediately destroy the compiled expression;
// the result is not used for matching. The enclosing try block suggests the
// call is expected to throw on a bad pattern - confirm.
if (isUTF8Type(expr->queryType()))
{
// The first argument is the value of rtlUtf8Length over the raw value
// (its size and data), not the raw size itself.
ICompiledStrRegExpr * compiled = rtlCreateCompiledU8StrRegExpr(rtlUtf8Length(value->getSize(), value->queryValue()), (const char *)value->queryValue(), false);
rtlDestroyCompiledU8StrRegExpr(compiled);
}
else if (isUnicodeType(expr->queryType()))
{
Owned<ITypeInfo> unknownVarUnicodeType = makeVarUnicodeType(UNKNOWN_LENGTH, nullptr);
Owned<IValue> castValue = value->castTo(unknownVarUnicodeType);
Expand Down
12 changes: 12 additions & 0 deletions ecl/hqlcpp/hqlcatom.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -528,16 +528,22 @@ IIdAtom * regexFindXId;
IIdAtom * regexGetFindStrId;
IIdAtom * regexNewSetStrPatternId;
IIdAtom * regexNewSetUStrPatternId;
IIdAtom * regexNewSetU8StrPatternId;
IIdAtom * regexNewStrFindId;
IIdAtom * regexNewStrFoundId;
IIdAtom * regexNewStrFoundXId;
IIdAtom * regexNewStrReplaceXId;
IIdAtom * regexNewUStrFindId;
IIdAtom * regexNewU8StrFindId;
IIdAtom * regexNewUStrFoundId;
IIdAtom * regexNewU8StrFoundId;
IIdAtom * regexNewUStrFoundXId;
IIdAtom * regexNewU8StrFoundXId;
IIdAtom * regexNewUStrReplaceXId;
IIdAtom * regexNewU8StrReplaceXId;
IIdAtom * regexMatchSetId;
IIdAtom * regexUStrMatchSetId;
IIdAtom * regexU8StrMatchSetId;
IIdAtom * regexReplaceXId;
IIdAtom * registerTimerId;
IIdAtom * releaseRowId;
Expand Down Expand Up @@ -1207,16 +1213,22 @@ MODULE_INIT(INIT_PRIORITY_HQLATOM-1)
MAKEID(regexGetFindStr);
MAKEID(regexNewSetStrPattern);
MAKEID(regexNewSetUStrPattern);
MAKEID(regexNewSetU8StrPattern);
MAKEID(regexNewStrFind);
MAKEID(regexNewStrFound);
MAKEID(regexNewStrFoundX);
MAKEID(regexNewStrReplaceX);
MAKEID(regexNewUStrFind);
MAKEID(regexNewU8StrFind);
MAKEID(regexNewUStrFound);
MAKEID(regexNewU8StrFound);
MAKEID(regexNewUStrFoundX);
MAKEID(regexNewU8StrFoundX);
MAKEID(regexNewUStrReplaceX);
MAKEID(regexNewU8StrReplaceX);
MAKEID(regexMatchSet);
MAKEID(regexUStrMatchSet);
MAKEID(regexU8StrMatchSet);
MAKEID(regexReplaceX);
MAKEID(registerTimer);
MAKEID(releaseRow);
Expand Down
6 changes: 6 additions & 0 deletions ecl/hqlcpp/hqlcatom.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -526,16 +526,22 @@ extern IIdAtom * regexFindXId;
extern IIdAtom * regexGetFindStrId;
extern IIdAtom * regexNewSetStrPatternId;
extern IIdAtom * regexNewSetUStrPatternId;
extern IIdAtom * regexNewSetU8StrPatternId;
extern IIdAtom * regexNewStrFindId;
extern IIdAtom * regexNewStrFoundId;
extern IIdAtom * regexNewStrFoundXId;
extern IIdAtom * regexNewStrReplaceXId;
extern IIdAtom * regexNewUStrFindId;
extern IIdAtom * regexNewU8StrFindId;
extern IIdAtom * regexNewUStrFoundId;
extern IIdAtom * regexNewU8StrFoundId;
extern IIdAtom * regexNewUStrFoundXId;
extern IIdAtom * regexNewU8StrFoundXId;
extern IIdAtom * regexNewUStrReplaceXId;
extern IIdAtom * regexNewU8StrReplaceXId;
extern IIdAtom * regexMatchSetId;
extern IIdAtom * regexUStrMatchSetId;
extern IIdAtom * regexU8StrMatchSetId;
extern IIdAtom * regexReplaceXId;
extern IIdAtom * registerTimerId;
extern IIdAtom * releaseRowId;
Expand Down
2 changes: 1 addition & 1 deletion ecl/hqlcpp/hqlcpp.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -1739,7 +1739,7 @@ public:

void doBuildNewRegexFindReplace(BuildCtx & ctx, const CHqlBoundTarget * target, IHqlExpression * expr, CHqlBoundExpr * bound);

IHqlExpression * doBuildRegexCompileInstance(BuildCtx & ctx, IHqlExpression * pattern, bool unicode, bool caseSensitive);
IHqlExpression * doBuildRegexCompileInstance(BuildCtx & ctx, IHqlExpression * pattern, ITypeInfo * stringType, bool caseSensitive);
IHqlExpression * doBuildRegexFindInstance(BuildCtx & ctx, IHqlExpression * compiled, IHqlExpression * search, bool cloneSearch);

IHqlExpression * doCreateGraphLookup(BuildCtx & declarectx, BuildCtx & resolvectx, unique_id_t id, const char * activity, bool isChild);
Expand Down
7 changes: 7 additions & 0 deletions ecl/hqlcpp/hqlcppsys.ecl
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,13 @@ const char * cppSystemText[] = {
" unicode regexNewUStrReplaceX(const unicode _search, const unicode _replace) : method,pure,entrypoint='replace',time('REGEXREPLACE');"
" set of unicode regexUStrMatchSet(const unicode _search) : method,pure,entrypoint='getMatchSet',time('REGEXFINDSET');"

" regexNewSetU8StrPattern(const utf8 _pattern, boolean isCaseSensitive) : omethod,entrypoint='setPattern',time('CompileUTF8Regex');"
" regexNewU8StrFind(boolean _compiled, const utf8 _search, boolean _cloneSearch) : omethod,entrypoint='find',time('REGEXFIND');"
" boolean regexNewU8StrFound() : method,pure,entrypoint='found';"
" utf8 regexNewU8StrFoundX(unsigned4 idx) : method,pure,entrypoint='getMatchX';"
" utf8 regexNewU8StrReplaceX(const utf8 _search, const utf8 _replace) : method,pure,entrypoint='replace',time('REGEXREPLACE');"
" set of utf8 regexU8StrMatchSet(const utf8 _search) : method,pure,entrypoint='getMatchSet',time('REGEXFINDSET');"

//clibrary functions that are called from the code generation
" free(noconst data1 src) : eclrtl,library='eclrtl',entrypoint='rtlFree';",
" integer4 memcmp(const data1 target, const data1 src, unsigned4 len) : sys,pure,entrypoint='memcmp';",
Expand Down
Loading

0 comments on commit 1a033e1

Please sign in to comment.