From b7f085da63ed1f6a43b673f6987597d37d11bf7f Mon Sep 17 00:00:00 2001 From: ZongtianHou Date: Thu, 12 Nov 2020 10:02:54 +0800 Subject: [PATCH] HAWQ-1771. add TRANSLATE function and set KMP_LIMIT = 30 --- .../src/dbcommon/function/func-kind.h | 1 + .../dbcommon/src/dbcommon/function/func.cc | 2 +- .../function/string-binary-function.h | 1 + .../src/dbcommon/function/string-function.cc | 131 +++++++++++++----- .../unit/function/test-string-function.cc | 66 ++++++++- 5 files changed, 167 insertions(+), 34 deletions(-) diff --git a/depends/dbcommon/src/dbcommon/function/func-kind.h b/depends/dbcommon/src/dbcommon/function/func-kind.h index 4fbf9616fe..293ffb000a 100644 --- a/depends/dbcommon/src/dbcommon/function/func-kind.h +++ b/depends/dbcommon/src/dbcommon/function/func-kind.h @@ -246,6 +246,7 @@ generate_func_kind(ARITH_OP, INTERVAL_TYPE, INTERVAL_TYPE) STRING_RPAD, STRING_LPAD_NOFILL, STRING_RPAD_NOFILL, + STRING_TRANSLATE, // binary related functions BINARY_OCTET_LENGTH, diff --git a/depends/dbcommon/src/dbcommon/function/func.cc b/depends/dbcommon/src/dbcommon/function/func.cc index f9222b3fa7..36bddd657c 100644 --- a/depends/dbcommon/src/dbcommon/function/func.cc +++ b/depends/dbcommon/src/dbcommon/function/func.cc @@ -377,7 +377,7 @@ call_function_table(ARITH_OP, INTERVAL_TYPE, INTERVAL_TYPE) FuncEntryArray.push_back({STRING_RPAD, "string_rpad", STRINGID, {STRINGID, INTID, STRINGID}, string_rpad, false}); FuncEntryArray.push_back({STRING_LPAD_NOFILL, "string_lpad_nofill", STRINGID, {STRINGID, INTID}, string_lpad_nofill, false}); FuncEntryArray.push_back({STRING_RPAD_NOFILL, "string_rpad_nofill", STRINGID, {STRINGID, INTID}, string_rpad_nofill, false}); - + FuncEntryArray.push_back({STRING_TRANSLATE, "string_translate", STRINGID, {STRINGID, STRINGID, STRINGID}, string_translate, false}); FuncEntryArray.push_back({BINARY_OCTET_LENGTH, "binary_octet_length", INTID, {BINARYID}, binary_octet_length, false}); diff --git 
a/depends/dbcommon/src/dbcommon/function/string-binary-function.h b/depends/dbcommon/src/dbcommon/function/string-binary-function.h index ab4499c322..8375f8affa 100644 --- a/depends/dbcommon/src/dbcommon/function/string-binary-function.h +++ b/depends/dbcommon/src/dbcommon/function/string-binary-function.h @@ -46,6 +46,7 @@ Datum string_initcap(Datum *params, uint64_t size); Datum string_ascii(Datum *params, uint64_t size); Datum string_repeat(Datum *params, uint64_t size); Datum string_chr(Datum *params, uint64_t size); +Datum string_translate(Datum *params, uint64_t size); Datum string_bpchar(Datum *params, uint64_t size); Datum string_varchar(Datum *params, uint64_t size); diff --git a/depends/dbcommon/src/dbcommon/function/string-function.cc b/depends/dbcommon/src/dbcommon/function/string-function.cc index e70a40213c..dbf11c6dad 100644 --- a/depends/dbcommon/src/dbcommon/function/string-function.cc +++ b/depends/dbcommon/src/dbcommon/function/string-function.cc @@ -351,16 +351,30 @@ class utf8ptr { return *this; } - utf8ptr &operator+=(const int &len) { - int times = len; + utf8ptr &operator+=(const int32_t &len) { + int32_t times = len; while (times--) p_ += utf8_mblen(p_); return *this; } + utf8ptr &operator=(const char *p) { + if (p_ != p) p_ = p; + return *this; + } + + bool operator==(const utf8ptr &tmp) { + int32_t len = utf8_mblen(p_); + const char *tmp_ = p_; + const char *cmp_ = tmp.p_; + while (len && *tmp_++ == *cmp_++) len--; + if (len) return false; + return true; + } + char *get() { return const_cast(p_); } - int characterLength(const char *p) { - int len = 0; + int32_t characterLength(const char *p) { + int32_t len = 0; const char *tmp = p_; while (tmp != p) { tmp += utf8_mblen(tmp); @@ -369,11 +383,11 @@ class utf8ptr { return len; } - int characterLength(const int &len) { - int ret = 0, lenth = len; + int32_t characterLength(const int32_t &len) { + int32_t ret = 0, lenth = len; const char *tmp = p_; while (lenth > 0) { - int tLen = 
utf8_mblen(tmp); + int32_t tLen = utf8_mblen(tmp); lenth -= tLen; tmp += tLen; ret++; @@ -381,12 +395,12 @@ class utf8ptr { return ret; } - int byteLength(const int &len) { - int ret = 0; - int times = len; + int32_t byteLength(const int32_t &len) { + int32_t ret = 0; + int32_t times = len; const char *tmp = p_; while (times--) { - int tLen = utf8_mblen(tmp); + int32_t tLen = utf8_mblen(tmp); tmp += tLen; ret += tLen; } @@ -459,7 +473,7 @@ int32_t kmpPos(const char *str, const char *subStr, uint64_t len, int32_t *__restrict__ next = reinterpret_cast(kmpPosBuf->data()); next[0] = -1; - int i = 0, j = -1; + int32_t i = 0, j = -1; while (i < subLen - 1) { if (j == -1 || subStr[i] == subStr[j]) next[++i] = ++j; @@ -469,7 +483,7 @@ int32_t kmpPos(const char *str, const char *subStr, uint64_t len, i = 0; j = 0; - int lLen = len, sLen = subLen; + int32_t lLen = len, sLen = subLen; while (i < lLen && j < sLen) { if (j == -1 || subStr[j] == str[i]) { i++; @@ -488,10 +502,10 @@ int32_t naivePos(const char *str, const char *subStr, uint64_t len, uint64_t subLen) { if (len < subLen) return 0; - int times = len - subLen; - for (int i = 0; i <= times; i++) { + int32_t times = len - subLen; + for (int32_t i = 0; i <= times; i++) { bool flag = true; - for (int j = 0; j < subLen; j++) + for (int32_t j = 0; j < subLen; j++) if (str[i + j] != subStr[j]) { flag = false; break; @@ -502,9 +516,10 @@ int32_t naivePos(const char *str, const char *subStr, uint64_t len, } Datum string_position(Datum *params, uint64_t size) { + const uint32_t KMP_LIMIT = 30; auto subpos = [](ByteBuffer &buf, text src, text sub) -> int32_t { int32_t byteLen = 0; - if (sub.length < 15) { + if (sub.length < KMP_LIMIT) { byteLen = naivePos(src.val, sub.val, src.length, sub.length); } else { dbcommon::ByteBuffer kmpPosBuf(true); @@ -522,7 +537,7 @@ Datum string_initcap(Datum *params, uint64_t size) { char *ret = const_cast(buf.tail() - str.length); char last = ' '; - int times = str.length; + int32_t times = 
str.length; while (times--) { if (((unsigned int)((last | 0x20) - 'a') >= 26u && (unsigned int)(last - '0') >= 10u) && @@ -591,7 +606,7 @@ Datum string_substring_nolen(Datum *params, uint64_t size) { utf8ptr utfStrPtr(str.val); utfStrPtr += pos; char *strBegin = utfStrPtr.get(); - int len = str.val + str.length - strBegin; + int32_t len = str.val + str.length - strBegin; if (len < 0) len = 0; buf.resize(buf.size() + len); char *ret = const_cast(buf.tail() - len); @@ -604,7 +619,7 @@ Datum string_substring_nolen(Datum *params, uint64_t size) { inline int32_t myAscii(const unsigned char *data) { int32_t retval = 0; if (*data > 0x7F) { - int tsize = 0; + int32_t tsize = 0; if (*data >= 0xF0) { retval = *data & 0x07; tsize = 3; @@ -674,14 +689,14 @@ enum direction { left = 0, right, both }; template Datum string_trim_blank(Datum *params, uint64_t size) { auto trim = [](ByteBuffer &buf, text str) { - int l = 0, r = str.length - 1; + int32_t l = 0, r = str.length - 1; if (dir == direction::left || dir == direction::both) { while (l <= r && str.val[l] == ' ') l++; } if (dir == direction::right || dir == direction::both) { while (l <= r && str.val[r] == ' ') r--; } - int len = r - l + 1; + int32_t len = r - l + 1; if (len < 0) len = 0; buf.resize(buf.size() + len); char *ret = const_cast(buf.tail() - len); @@ -695,7 +710,7 @@ Datum string_trim_blank(Datum *params, uint64_t size) { template Datum string_trim_chars(Datum *params, uint64_t size) { auto trim = [](ByteBuffer &buf, text str, text chr) { - int l = 0, r = str.length - 1; + int32_t l = 0, r = str.length - 1; if (dir == direction::left || dir == direction::both) { std::string s(const_cast(chr.val), chr.length); while (l <= r && s.find(str.val[l]) != std::string::npos) l++; @@ -704,7 +719,7 @@ Datum string_trim_chars(Datum *params, uint64_t size) { std::string s(const_cast(chr.val), chr.length); while (l <= r && s.find(str.val[r]) != std::string::npos) r--; } - int len = r - l + 1; + int32_t len = r - l + 1; if (len 
< 0) len = 0; buf.resize(buf.size() + len); char *ret = const_cast(buf.tail() - len); @@ -767,7 +782,7 @@ Datum string_repeat(Datum *params, uint64_t size) { Datum string_chr(Datum *params, uint64_t size) { auto chr = [](ByteBuffer &buf, int32_t val) { - int len = 0; + int32_t len = 0; char wch[4]; if (val > 0x7F) { if (val > 0x001fffff) { @@ -872,7 +887,7 @@ Datum string_pad_blank(Datum *params, uint64_t size) { } int32_t writeLen = str.length < retByteLen ? str.length : retByteLen; - for (int i = 0; i < writeLen; i++) *ret++ = str.val[i]; + for (int32_t i = 0; i < writeLen; i++) *ret++ = str.val[i]; if (dir == direction::right) { int32_t remainder = retByteLen - str.length; @@ -904,7 +919,7 @@ Datum string_pad_chars(Datum *params, uint64_t size) { if (strCharLen >= len) { retByteLen = utfStrPtr.byteLength(len); } else { - int rem = len - strCharLen; + int32_t rem = len - strCharLen; while (rem >= filCharLen) { retByteLen += fil.length; rem -= filCharLen; @@ -922,10 +937,10 @@ Datum string_pad_chars(Datum *params, uint64_t size) { } else { while (remainder > 0) { if (remainder >= filCharLen) { - for (int i = 0; i < fil.length; i++) *ret++ = fil.val[i]; + for (int32_t i = 0; i < fil.length; i++) *ret++ = fil.val[i]; } else { int32_t fillLen = utfFilPtr.byteLength(remainder); - for (int i = 0; i < fillLen; i++) *ret++ = fil.val[i]; + for (int32_t i = 0; i < fillLen; i++) *ret++ = fil.val[i]; } remainder -= filCharLen; } @@ -933,7 +948,7 @@ Datum string_pad_chars(Datum *params, uint64_t size) { } int32_t writeLen = str.length < retByteLen ? 
str.length : retByteLen; - for (int i = 0; i < writeLen; i++) *ret++ = str.val[i]; + for (int32_t i = 0; i < writeLen; i++) *ret++ = str.val[i]; if (dir == direction::right) { int32_t remainder = len - strCharLen; @@ -943,10 +958,10 @@ Datum string_pad_chars(Datum *params, uint64_t size) { } else { while (remainder > 0) { if (remainder >= filCharLen) { - for (int i = 0; i < fil.length; i++) *ret++ = fil.val[i]; + for (int32_t i = 0; i < fil.length; i++) *ret++ = fil.val[i]; } else { int32_t fillLen = utfFilPtr.byteLength(remainder); - for (int i = 0; i < fillLen; i++) *ret++ = fil.val[i]; + for (int32_t i = 0; i < fillLen; i++) *ret++ = fil.val[i]; } remainder -= filCharLen; } @@ -966,4 +981,56 @@ Datum string_rpad(Datum *params, uint64_t size) { return string_pad_chars(params, size); } +Datum string_translate(Datum *params, uint64_t size) { + auto translate = [](ByteBuffer &buf, text str, text from, text to) { + utf8ptr utfStrPtr(str.val); + utf8ptr utfFromPtr(from.val); + utf8ptr utfToPtr(to.val); + int32_t strCharLen = utfStrPtr.characterLength(str.val + str.length); + int32_t fromCharLen = utfFromPtr.characterLength(from.val + from.length); + int32_t toCharLen = utfToPtr.characterLength(to.val + to.length); + int32_t retByteLen = 0; + int32_t worstLen = strCharLen * 4; + + // if (worstLen / 4 != strCharLen) { + // it won't appear one number which has int32_t length; + // LOG_ERROR(ERRCODE_PROGRAM_LIMIT_EXCEEDED, + // "requested length too large"); + // } + + buf.resize(buf.size() + worstLen); + char *ret = const_cast(buf.tail() - worstLen); + + auto writeByte = [&](utf8ptr src) { + char *tmp = src.get(); + int32_t len = utf8_mblen(tmp); + retByteLen += len; + for (int32_t k = 0; k < len; k++) *ret++ = *tmp++; + }; + + for (int32_t i = 0; i < strCharLen; i++) { + int32_t j = 0; + utfFromPtr = from.val; + utfToPtr = to.val; + for (; j < fromCharLen; j++) { + if (utfStrPtr == utfFromPtr) { + if (j < toCharLen) { + utfToPtr += j; + writeByte(utfToPtr); + } + break; 
+ } + ++utfFromPtr; + } + if (j == fromCharLen) { + writeByte(utfStrPtr); + } + ++utfStrPtr; + } + buf.resize(buf.size() - (worstLen - retByteLen)); + return text(nullptr, retByteLen); + }; + return three_params_bind(params, size, translate); +} + } // namespace dbcommon diff --git a/depends/dbcommon/test/unit/function/test-string-function.cc b/depends/dbcommon/test/unit/function/test-string-function.cc index ecef323bc6..f374a2192d 100644 --- a/depends/dbcommon/test/unit/function/test-string-function.cc +++ b/depends/dbcommon/test/unit/function/test-string-function.cc @@ -1575,5 +1575,69 @@ INSTANTIATE_TEST_CASE_P( TestFunctionEntry{FuncKind::STRING_RPAD_NOFILL, "Vector: NULL NULL NULL", {"Scalar: NULL", "Vector: 4 5 6"}})); - +INSTANTIATE_TEST_CASE_P( + string_translate, TestFunction, + ::testing::Values( + TestFunctionEntry{ + FuncKind::STRING_TRANSLATE, + "Vector: 小1灵b2 真笨 1六六五 6六caa 6六cb诶诶", + {"Vector: 小a灵b通 真厉害 1二3四五 6六c西ff 6六c西ff", + "Vector: a通 厉害 二3四 ff西 f西f", "Vector: 12 笨 六六 a比 诶b"}}, + TestFunctionEntry{FuncKind::STRING_TRANSLATE, + "Vector{delimiter=,}: 1b2d,1cc1, ,bcbc,NULL,", + {"Vector{delimiter=,}: abcd,abccba,aaaa,bcbc,NULL,", + "Vector{delimiter=,}: ac,ab,a,,x,x", + "Vector{delimiter=,}: 123,1, ,b,y,y"}}, + TestFunctionEntry{FuncKind::STRING_TRANSLATE, + "Vector{delimiter=,}: 1b2d,12cc21,111,", + {"Vector{delimiter=,}: abcd,abccba,aaa,", + "Vector{delimiter=,}: ac,ab,a,x", "Scalar: 123"}}, + TestFunctionEntry{FuncKind::STRING_TRANSLATE, + "Vector{delimiter=,}: NULL,NULL,NULL,NULL", + {"Vector{delimiter=,}: abcd,abccba,aaa,", + "Vector{delimiter=,}: ac,ab,a,x", "Scalar: NULL"}}, + TestFunctionEntry{FuncKind::STRING_TRANSLATE, + "Vector{delimiter=,}: xb3d,b11b, ", + {"Vector{delimiter=,}: abcd,abccba, aaa", + "Scalar: ca", "Vector{delimiter=,}: 3x,1,"}}, + TestFunctionEntry{FuncKind::STRING_TRANSLATE, + "Vector{delimiter=,}: NULL,NULL,NULL", + {"Vector{delimiter=,}: abc,cdda,NULL", "Scalar: NULL", + "Vector{delimiter=,}: 3x,1,"}}, + 
TestFunctionEntry{FuncKind::STRING_TRANSLATE, + "Vector{delimiter=,}: 12cde,12cc21,1 2 c 21", + {"Vector{delimiter=,}: abcde,abccba,a b c ba", + "Scalar: ab", "Scalar: 12"}}, + TestFunctionEntry{FuncKind::STRING_TRANSLATE, + "Vector{delimiter=,}: NULL,NULL,NULL", + {"Vector{delimiter=,}: abcde,abccba,a b c ba", + "Scalar: NULL", "Scalar: 12"}}, + TestFunctionEntry{ + FuncKind::STRING_TRANSLATE, + "Vector{delimiter=,}: 1b22b1,1cc1,1bccb1,bccb,abccba,NULL", + {"Scalar: abccba", "Vector{delimiter=,}: ac,ba,ad,da,,NULL", + "Vector{delimiter=,}: 123,1,12,2,1,y"}}, + TestFunctionEntry{ + FuncKind::STRING_TRANSLATE, + "Vector{delimiter=,}: NULL,NULL,NULL,NULL,NULL,NULL", + {"Scalar: NULL", "Vector{delimiter=,}: ac,ba,ad,da,,NULL", + "Vector{delimiter=,}: 123,1,12,2,1,y"}}, + TestFunctionEntry{ + FuncKind::STRING_TRANSLATE, + "Vector{delimiter=,}: a2112a,aa,NULL", + {"Scalar: abccba", "Scalar: cb", "Vector{delimiter=,}: 123,,NULL"}}, + TestFunctionEntry{FuncKind::STRING_TRANSLATE, + "Vector{delimiter=,}: NULL,NULL,NULL,NULL,NULL,NULL", + {"Scalar: abc", "Scalar: NULL", + "Vector{delimiter=,}: 123,1,12,2,1,y"}}, + TestFunctionEntry{ + FuncKind::STRING_TRANSLATE, + "Vector{delimiter=,}: a2112a,ab11ba,abccba,abccba,NULL", + {"Scalar: abccba", "Vector{delimiter=,}: cb,c, ,,NULL", + "Scalar: 123"}}, + TestFunctionEntry{ + FuncKind::STRING_TRANSLATE, + "Vector{delimiter=,}: NULL,NULL,NULL,NULL,NULL,NULL", + {"Scalar: NULL", "Vector{delimiter=,}: ac,ba,ad,da,,NULL", + "Scalar: 123"}})); } // namespace dbcommon