Skip to content
This repository has been archived by the owner on Jul 23, 2024. It is now read-only.

Commit

Permalink
HAWQ-1771. add TRANSLATE function and set KMP_LIMIT = 30
Browse files Browse the repository at this point in the history
  • Loading branch information
Librago committed Nov 12, 2020
1 parent 6b045e7 commit b7f085d
Show file tree
Hide file tree
Showing 5 changed files with 167 additions and 34 deletions.
1 change: 1 addition & 0 deletions depends/dbcommon/src/dbcommon/function/func-kind.h
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,7 @@ generate_func_kind(ARITH_OP, INTERVAL_TYPE, INTERVAL_TYPE)
STRING_RPAD,
STRING_LPAD_NOFILL,
STRING_RPAD_NOFILL,
STRING_TRANSLATE,

// binary related functions
BINARY_OCTET_LENGTH,
Expand Down
2 changes: 1 addition & 1 deletion depends/dbcommon/src/dbcommon/function/func.cc
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,7 @@ call_function_table(ARITH_OP, INTERVAL_TYPE, INTERVAL_TYPE)
FuncEntryArray.push_back({STRING_RPAD, "string_rpad", STRINGID, {STRINGID, INTID, STRINGID}, string_rpad, false});
FuncEntryArray.push_back({STRING_LPAD_NOFILL, "string_lpad_nofill", STRINGID, {STRINGID, INTID}, string_lpad_nofill, false});
FuncEntryArray.push_back({STRING_RPAD_NOFILL, "string_rpad_nofill", STRINGID, {STRINGID, INTID}, string_rpad_nofill, false});

FuncEntryArray.push_back({STRING_TRANSLATE, "string_translate", STRINGID, {STRINGID, STRINGID, STRINGID}, string_translate, false});

FuncEntryArray.push_back({BINARY_OCTET_LENGTH, "binary_octet_length", INTID, {BINARYID}, binary_octet_length, false});

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ Datum string_initcap(Datum *params, uint64_t size);
Datum string_ascii(Datum *params, uint64_t size);
Datum string_repeat(Datum *params, uint64_t size);
Datum string_chr(Datum *params, uint64_t size);
Datum string_translate(Datum *params, uint64_t size);

Datum string_bpchar(Datum *params, uint64_t size);
Datum string_varchar(Datum *params, uint64_t size);
Expand Down
131 changes: 99 additions & 32 deletions depends/dbcommon/src/dbcommon/function/string-function.cc
Original file line number Diff line number Diff line change
Expand Up @@ -351,16 +351,30 @@ class utf8ptr {
return *this;
}

utf8ptr &operator+=(const int &len) {
int times = len;
utf8ptr &operator+=(const int32_t &len) {
int32_t times = len;
while (times--) p_ += utf8_mblen(p_);
return *this;
}

// Re-points this cursor at the byte address `p`. Only the stored pointer
// changes; no character data is copied.
utf8ptr &operator=(const char *p) {
  p_ = p;
  return *this;
}

bool operator==(const utf8ptr &tmp) {
int32_t len = utf8_mblen(p_);
const char *tmp_ = p_;
const char *cmp_ = tmp.p_;
while (len && *tmp_++ == *cmp_++) len--;
if (len) return false;
return true;
}

// Returns the cursor's current position as a non-const char pointer; the
// constness of the stored pointer is cast away.
char *get() {
  char *raw = const_cast<char *>(p_);
  return raw;
}

int characterLength(const char *p) {
int len = 0;
int32_t characterLength(const char *p) {
int32_t len = 0;
const char *tmp = p_;
while (tmp != p) {
tmp += utf8_mblen(tmp);
Expand All @@ -369,24 +383,24 @@ class utf8ptr {
return len;
}

int characterLength(const int &len) {
int ret = 0, lenth = len;
int32_t characterLength(const int32_t &len) {
int32_t ret = 0, lenth = len;
const char *tmp = p_;
while (lenth > 0) {
int tLen = utf8_mblen(tmp);
int32_t tLen = utf8_mblen(tmp);
lenth -= tLen;
tmp += tLen;
ret++;
}
return ret;
}

int byteLength(const int &len) {
int ret = 0;
int times = len;
int32_t byteLength(const int32_t &len) {
int32_t ret = 0;
int32_t times = len;
const char *tmp = p_;
while (times--) {
int tLen = utf8_mblen(tmp);
int32_t tLen = utf8_mblen(tmp);
tmp += tLen;
ret += tLen;
}
Expand Down Expand Up @@ -459,7 +473,7 @@ int32_t kmpPos(const char *str, const char *subStr, uint64_t len,
int32_t *__restrict__ next = reinterpret_cast<int32_t *>(kmpPosBuf->data());

next[0] = -1;
int i = 0, j = -1;
int32_t i = 0, j = -1;
while (i < subLen - 1) {
if (j == -1 || subStr[i] == subStr[j])
next[++i] = ++j;
Expand All @@ -469,7 +483,7 @@ int32_t kmpPos(const char *str, const char *subStr, uint64_t len,

i = 0;
j = 0;
int lLen = len, sLen = subLen;
int32_t lLen = len, sLen = subLen;
while (i < lLen && j < sLen) {
if (j == -1 || subStr[j] == str[i]) {
i++;
Expand All @@ -488,10 +502,10 @@ int32_t naivePos(const char *str, const char *subStr, uint64_t len,
uint64_t subLen) {
if (len < subLen) return 0;

int times = len - subLen;
for (int i = 0; i <= times; i++) {
int32_t times = len - subLen;
for (int32_t i = 0; i <= times; i++) {
bool flag = true;
for (int j = 0; j < subLen; j++)
for (int32_t j = 0; j < subLen; j++)
if (str[i + j] != subStr[j]) {
flag = false;
break;
Expand All @@ -502,9 +516,10 @@ int32_t naivePos(const char *str, const char *subStr, uint64_t len,
}

Datum string_position(Datum *params, uint64_t size) {
const uint32_t KMP_LIMIT = 30;
auto subpos = [](ByteBuffer &buf, text src, text sub) -> int32_t {
int32_t byteLen = 0;
if (sub.length < 15) {
if (sub.length < KMP_LIMIT) {
byteLen = naivePos(src.val, sub.val, src.length, sub.length);
} else {
dbcommon::ByteBuffer kmpPosBuf(true);
Expand All @@ -522,7 +537,7 @@ Datum string_initcap(Datum *params, uint64_t size) {
char *ret = const_cast<char *>(buf.tail() - str.length);

char last = ' ';
int times = str.length;
int32_t times = str.length;
while (times--) {
if (((unsigned int)((last | 0x20) - 'a') >= 26u &&
(unsigned int)(last - '0') >= 10u) &&
Expand Down Expand Up @@ -591,7 +606,7 @@ Datum string_substring_nolen(Datum *params, uint64_t size) {
utf8ptr utfStrPtr(str.val);
utfStrPtr += pos;
char *strBegin = utfStrPtr.get();
int len = str.val + str.length - strBegin;
int32_t len = str.val + str.length - strBegin;
if (len < 0) len = 0;
buf.resize(buf.size() + len);
char *ret = const_cast<char *>(buf.tail() - len);
Expand All @@ -604,7 +619,7 @@ Datum string_substring_nolen(Datum *params, uint64_t size) {
inline int32_t myAscii(const unsigned char *data) {
int32_t retval = 0;
if (*data > 0x7F) {
int tsize = 0;
int32_t tsize = 0;
if (*data >= 0xF0) {
retval = *data & 0x07;
tsize = 3;
Expand Down Expand Up @@ -674,14 +689,14 @@ enum direction { left = 0, right, both };
template <direction dir>
Datum string_trim_blank(Datum *params, uint64_t size) {
auto trim = [](ByteBuffer &buf, text str) {
int l = 0, r = str.length - 1;
int32_t l = 0, r = str.length - 1;
if (dir == direction::left || dir == direction::both) {
while (l <= r && str.val[l] == ' ') l++;
}
if (dir == direction::right || dir == direction::both) {
while (l <= r && str.val[r] == ' ') r--;
}
int len = r - l + 1;
int32_t len = r - l + 1;
if (len < 0) len = 0;
buf.resize(buf.size() + len);
char *ret = const_cast<char *>(buf.tail() - len);
Expand All @@ -695,7 +710,7 @@ Datum string_trim_blank(Datum *params, uint64_t size) {
template <direction dir>
Datum string_trim_chars(Datum *params, uint64_t size) {
auto trim = [](ByteBuffer &buf, text str, text chr) {
int l = 0, r = str.length - 1;
int32_t l = 0, r = str.length - 1;
if (dir == direction::left || dir == direction::both) {
std::string s(const_cast<char *>(chr.val), chr.length);
while (l <= r && s.find(str.val[l]) != std::string::npos) l++;
Expand All @@ -704,7 +719,7 @@ Datum string_trim_chars(Datum *params, uint64_t size) {
std::string s(const_cast<char *>(chr.val), chr.length);
while (l <= r && s.find(str.val[r]) != std::string::npos) r--;
}
int len = r - l + 1;
int32_t len = r - l + 1;
if (len < 0) len = 0;
buf.resize(buf.size() + len);
char *ret = const_cast<char *>(buf.tail() - len);
Expand Down Expand Up @@ -767,7 +782,7 @@ Datum string_repeat(Datum *params, uint64_t size) {

Datum string_chr(Datum *params, uint64_t size) {
auto chr = [](ByteBuffer &buf, int32_t val) {
int len = 0;
int32_t len = 0;
char wch[4];
if (val > 0x7F) {
if (val > 0x001fffff) {
Expand Down Expand Up @@ -872,7 +887,7 @@ Datum string_pad_blank(Datum *params, uint64_t size) {
}

int32_t writeLen = str.length < retByteLen ? str.length : retByteLen;
for (int i = 0; i < writeLen; i++) *ret++ = str.val[i];
for (int32_t i = 0; i < writeLen; i++) *ret++ = str.val[i];

if (dir == direction::right) {
int32_t remainder = retByteLen - str.length;
Expand Down Expand Up @@ -904,7 +919,7 @@ Datum string_pad_chars(Datum *params, uint64_t size) {
if (strCharLen >= len) {
retByteLen = utfStrPtr.byteLength(len);
} else {
int rem = len - strCharLen;
int32_t rem = len - strCharLen;
while (rem >= filCharLen) {
retByteLen += fil.length;
rem -= filCharLen;
Expand All @@ -922,18 +937,18 @@ Datum string_pad_chars(Datum *params, uint64_t size) {
} else {
while (remainder > 0) {
if (remainder >= filCharLen) {
for (int i = 0; i < fil.length; i++) *ret++ = fil.val[i];
for (int32_t i = 0; i < fil.length; i++) *ret++ = fil.val[i];
} else {
int32_t fillLen = utfFilPtr.byteLength(remainder);
for (int i = 0; i < fillLen; i++) *ret++ = fil.val[i];
// Partial fill: fewer than filCharLen characters remain, so copy only the
// first fillLen bytes of the fill string (not the source string).
for (int32_t i = 0; i < fillLen; i++) *ret++ = fil.val[i];
}
remainder -= filCharLen;
}
}
}

int32_t writeLen = str.length < retByteLen ? str.length : retByteLen;
for (int i = 0; i < writeLen; i++) *ret++ = str.val[i];
for (int32_t i = 0; i < writeLen; i++) *ret++ = str.val[i];

if (dir == direction::right) {
int32_t remainder = len - strCharLen;
Expand All @@ -943,10 +958,10 @@ Datum string_pad_chars(Datum *params, uint64_t size) {
} else {
while (remainder > 0) {
if (remainder >= filCharLen) {
for (int i = 0; i < fil.length; i++) *ret++ = fil.val[i];
for (int32_t i = 0; i < fil.length; i++) *ret++ = fil.val[i];
} else {
int32_t fillLen = utfFilPtr.byteLength(remainder);
for (int i = 0; i < fillLen; i++) *ret++ = fil.val[i];
for (int32_t i = 0; i < fillLen; i++) *ret++ = fil.val[i];
}
remainder -= filCharLen;
}
Expand All @@ -966,4 +981,56 @@ Datum string_rpad(Datum *params, uint64_t size) {
return string_pad_chars<direction::right>(params, size);
}

// TRANSLATE(str, from, to): walks `str` one UTF-8 character at a time.
//  - A character that occurs in `from` at character index k is replaced by the
//    character at index k of `to`.
//  - If `to` has no character at index k, the character is dropped.
//  - A character that does not occur in `from` is copied unchanged.
// When a character occurs several times in `from`, the first occurrence wins.
//
// @param params argument/result slots forwarded to three_params_bind
// @param size   number of entries in `params`
// @return whatever three_params_bind produces for the bound lambda
Datum string_translate(Datum *params, uint64_t size) {
  auto translate = [](ByteBuffer &buf, text str, text from, text to) {
    utf8ptr srcCur(str.val);
    utf8ptr fromCur(from.val);
    utf8ptr toCur(to.val);
    const int32_t srcChars = srcCur.characterLength(str.val + str.length);
    const int32_t fromChars = fromCur.characterLength(from.val + from.length);
    const int32_t toChars = toCur.characterLength(to.val + to.length);

    // Reserve four bytes per input character, then shrink to the bytes
    // actually written once translation is done.
    // NOTE(review): srcChars * 4 is not checked for int32_t overflow; an
    // earlier "requested length too large" guard was left disabled on the
    // assumption that such lengths do not occur.
    const int32_t reserved = srcChars * 4;
    int32_t written = 0;

    buf.resize(buf.size() + reserved);
    char *out = const_cast<char *>(buf.tail() - reserved);

    // Appends the single UTF-8 character under `ch` to the output region.
    auto emit = [&](utf8ptr ch) {
      char *bytes = ch.get();
      const int32_t n = utf8_mblen(bytes);
      written += n;
      for (int32_t k = 0; k < n; ++k) *out++ = *bytes++;
    };

    for (int32_t remaining = srcChars; remaining > 0; --remaining, ++srcCur) {
      // Restart both mapping cursors for every source character.
      fromCur = from.val;
      toCur = to.val;

      // Find the character index of the first match in `from`, if any.
      int32_t idx = 0;
      while (idx < fromChars && !(srcCur == fromCur)) {
        ++fromCur;
        ++idx;
      }

      if (idx == fromChars) {
        // Not part of the mapping: copy through unchanged.
        emit(srcCur);
      } else if (idx < toChars) {
        // Mapped: emit the character at the same index of `to`.
        toCur += idx;
        emit(toCur);
      }
      // Otherwise the character is in `from` but `to` is too short: drop it.
    }

    // Give back the unused tail of the reservation.
    buf.resize(buf.size() - (reserved - written));
    // The data pointer is left null here; presumably the bind helper resolves
    // it from `buf` -- confirm against three_params_bind.
    return text(nullptr, written);
  };
  return three_params_bind<text, text, text, text>(params, size, translate);
}

} // namespace dbcommon
66 changes: 65 additions & 1 deletion depends/dbcommon/test/unit/function/test-string-function.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1575,5 +1575,69 @@ INSTANTIATE_TEST_CASE_P(
TestFunctionEntry{FuncKind::STRING_RPAD_NOFILL,
"Vector: NULL NULL NULL",
{"Scalar: NULL", "Vector: 4 5 6"}}));

// Parameterized cases for FuncKind::STRING_TRANSLATE. Judging from the data,
// each entry holds the expected result first, followed by the three inputs in
// order: source string, "from" character set, "to" character set.
// The cases cover multi-byte (CJK) characters, a "to" set shorter than "from"
// (the unmatched characters are removed), empty "from"/"to" sets, every
// Vector/Scalar combination of the three arguments, and NULL in any argument
// (the expected result is then NULL).
INSTANTIATE_TEST_CASE_P(
string_translate, TestFunction,
::testing::Values(
TestFunctionEntry{
FuncKind::STRING_TRANSLATE,
"Vector: 小1灵b2 真笨 1六六五 6六caa 6六cb诶诶",
{"Vector: 小a灵b通 真厉害 1二3四五 6六c西ff 6六c西ff",
"Vector: a通 厉害 二3四 ff西 f西f", "Vector: 12 笨 六六 a比 诶b"}},
TestFunctionEntry{FuncKind::STRING_TRANSLATE,
"Vector{delimiter=,}: 1b2d,1cc1, ,bcbc,NULL,",
{"Vector{delimiter=,}: abcd,abccba,aaaa,bcbc,NULL,",
"Vector{delimiter=,}: ac,ab,a,,x,x",
"Vector{delimiter=,}: 123,1, ,b,y,y"}},
TestFunctionEntry{FuncKind::STRING_TRANSLATE,
"Vector{delimiter=,}: 1b2d,12cc21,111,",
{"Vector{delimiter=,}: abcd,abccba,aaa,",
"Vector{delimiter=,}: ac,ab,a,x", "Scalar: 123"}},
TestFunctionEntry{FuncKind::STRING_TRANSLATE,
"Vector{delimiter=,}: NULL,NULL,NULL,NULL",
{"Vector{delimiter=,}: abcd,abccba,aaa,",
"Vector{delimiter=,}: ac,ab,a,x", "Scalar: NULL"}},
TestFunctionEntry{FuncKind::STRING_TRANSLATE,
"Vector{delimiter=,}: xb3d,b11b, ",
{"Vector{delimiter=,}: abcd,abccba, aaa",
"Scalar: ca", "Vector{delimiter=,}: 3x,1,"}},
TestFunctionEntry{FuncKind::STRING_TRANSLATE,
"Vector{delimiter=,}: NULL,NULL,NULL",
{"Vector{delimiter=,}: abc,cdda,NULL", "Scalar: NULL",
"Vector{delimiter=,}: 3x,1,"}},
TestFunctionEntry{FuncKind::STRING_TRANSLATE,
"Vector{delimiter=,}: 12cde,12cc21,1 2 c 21",
{"Vector{delimiter=,}: abcde,abccba,a b c ba",
"Scalar: ab", "Scalar: 12"}},
TestFunctionEntry{FuncKind::STRING_TRANSLATE,
"Vector{delimiter=,}: NULL,NULL,NULL",
{"Vector{delimiter=,}: abcde,abccba,a b c ba",
"Scalar: NULL", "Scalar: 12"}},
TestFunctionEntry{
FuncKind::STRING_TRANSLATE,
"Vector{delimiter=,}: 1b22b1,1cc1,1bccb1,bccb,abccba,NULL",
{"Scalar: abccba", "Vector{delimiter=,}: ac,ba,ad,da,,NULL",
"Vector{delimiter=,}: 123,1,12,2,1,y"}},
TestFunctionEntry{
FuncKind::STRING_TRANSLATE,
"Vector{delimiter=,}: NULL,NULL,NULL,NULL,NULL,NULL",
{"Scalar: NULL", "Vector{delimiter=,}: ac,ba,ad,da,,NULL",
"Vector{delimiter=,}: 123,1,12,2,1,y"}},
TestFunctionEntry{
FuncKind::STRING_TRANSLATE,
"Vector{delimiter=,}: a2112a,aa,NULL",
{"Scalar: abccba", "Scalar: cb", "Vector{delimiter=,}: 123,,NULL"}},
TestFunctionEntry{FuncKind::STRING_TRANSLATE,
"Vector{delimiter=,}: NULL,NULL,NULL,NULL,NULL,NULL",
{"Scalar: abc", "Scalar: NULL",
"Vector{delimiter=,}: 123,1,12,2,1,y"}},
TestFunctionEntry{
FuncKind::STRING_TRANSLATE,
"Vector{delimiter=,}: a2112a,ab11ba,abccba,abccba,NULL",
{"Scalar: abccba", "Vector{delimiter=,}: cb,c, ,,NULL",
"Scalar: 123"}},
TestFunctionEntry{
FuncKind::STRING_TRANSLATE,
"Vector{delimiter=,}: NULL,NULL,NULL,NULL,NULL,NULL",
{"Scalar: NULL", "Vector{delimiter=,}: ac,ba,ad,da,,NULL",
"Scalar: 123"}}));
} // namespace dbcommon

0 comments on commit b7f085d

Please sign in to comment.