Skip to content

Commit

Permalink
feat: support convert_from/convert_to function that only support …
Browse files Browse the repository at this point in the history
…UTF8 encoding (#13597)
  • Loading branch information
jetjinser authored Nov 23, 2023
1 parent b1c5ccd commit edd0805
Show file tree
Hide file tree
Showing 5 changed files with 158 additions and 1 deletion.
114 changes: 114 additions & 0 deletions e2e_test/batch/functions/encdec.slt.part
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# Text/Binary String Conversion Functions
#
# Covers encode/decode with the escape, base64 and hex formats, and
# convert_from/convert_to with the UTF8 encoding.

# encode/decode
# escape
# The bytes 0xc3 0xb1 are rendered as the three-digit octal escapes \303 \261.
query T
SELECT encode('\xc3b1', 'escape');
----
\303\261

# Inverse of the query above: the octal escapes decode back to the original bytes.
query T
SELECT decode('\303\261', 'escape');
----
\xc3b1

# A longer byte string: every byte is rendered as an octal escape.
query T
SELECT encode('\xe78e8be58695e6adbbe4ba86e788b6e4bab2', 'escape');
----
\347\216\213\345\206\225\346\255\273\344\272\206\347\210\266\344\272\262

# Literal multi-byte characters decode to the same bytes that were encoded above.
query T
SELECT decode('王冕死了父亲', 'escape');
----
\xe78e8be58695e6adbbe4ba86e788b6e4bab2

# base64
# The five bytes '1' '2' '3' 0x00 0x01 are encoded, then decoded back below.
query T
SELECT encode('123\000\001', 'base64');
----
MTIzAAE=

query T
SELECT decode('MTIzAAE=', 'base64');
----
\x3132330001

# hex
# Each input byte becomes two lowercase hex digits; decode reverses it.
query T
SELECT encode('joanna', 'hex');
----
6a6f616e6e61

query T
SELECT decode('6a6f616e6e61', 'hex');
----
\x6a6f616e6e61

# convert
# convert_from
# The bytes are interpreted as UTF-8 and returned as text.
query T
SELECT convert_from('\xc3b1', 'UTF8');
----
ñ

query T
SELECT convert_from('\xe78e8be58695e6adbbe4ba86e788b6e4bab2', 'UTF8');
----
王冕死了父亲

query T
SELECT convert_from('\x6a6f616e6e61', 'UTF8');
----
joanna

# The encoding name may be spelled UTF8 or UTF-8, in any letter case.
query T
SELECT convert_from('\x6a6f616e6e61', 'UTF-8');
----
joanna

query T
SELECT convert_from('\x6a6f616e6e61', 'utf8');
----
joanna

query T
SELECT convert_from('\x6a6f616e6e61', 'utf-8');
----
joanna

query T
SELECT convert_from('\x6a6f616e6e61', 'UtF8');
----
joanna

query T
SELECT convert_from('\x6a6f616e6e61', 'uTF-8');
----
joanna

# convert_to
# Text is returned as its UTF-8 bytes. The cases below use characters that
# occupy one byte, three bytes and four bytes respectively.
query T
SELECT convert_to('some_text', 'UTF8');
----
\x736f6d655f74657874

query T
SELECT convert_to('柠檬', 'UTF8');
----
\xe69fa0e6aaac

query T
SELECT convert_to('🍋', 'UTF8');
----
\xf09f8d8b

# Round trip: convert_from(convert_to(x, 'UTF8'), 'UTF8') returns x.
query T
SELECT convert_from(convert_to('good', 'UTF8'), 'UTF8');
----
good

query T
SELECT convert_from(convert_to('wow🐮', 'UTF8'), 'UTF8');
----
wow🐮
2 changes: 2 additions & 0 deletions proto/expr.proto
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,8 @@ message ExprNode {
FORMAT = 319;
PGWIRE_SEND = 320;
PGWIRE_RECV = 321;
CONVERT_FROM = 322;
CONVERT_TO = 323;

// Unary operators
NEG = 401;
Expand Down
37 changes: 37 additions & 0 deletions src/expr/impl/src/scalar/encdec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,43 @@ pub fn decode(data: &str, format: &str) -> Result<Box<[u8]>> {
}
}

/// Character sets accepted by `convert_from` and `convert_to`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CharacterSet {
    /// UTF-8, the only encoding recognized so far.
    Utf8,
}

impl CharacterSet {
    /// Maps a user-supplied encoding name to a [`CharacterSet`].
    ///
    /// `UTF8` and `UTF-8` are accepted in any ASCII letter case.
    ///
    /// # Errors
    ///
    /// Returns [`ExprError::InvalidParam`] (parameter name `encoding`) for any other
    /// name.
    fn recognize(encoding: &str) -> Result<Self> {
        // Compare without building an upper-cased copy of the input: this runs once
        // per evaluated row, and both accepted spellings are plain ASCII.
        if encoding.eq_ignore_ascii_case("UTF8") || encoding.eq_ignore_ascii_case("UTF-8") {
            return Ok(Self::Utf8);
        }

        Err(ExprError::InvalidParam {
            name: "encoding",
            reason: format!("unrecognized encoding: \"{}\"", encoding).into(),
        })
    }
}

/// Decodes `data`, a byte string in the `src_encoding` character set, into text and
/// writes the result to `writer`.
///
/// Only UTF-8 is recognized as a source encoding; the name is resolved by
/// `CharacterSet::recognize`.
///
/// # Errors
///
/// Returns [`ExprError::InvalidParam`] if `src_encoding` is not a recognized encoding
/// name (parameter `encoding`), or if `data` is not valid UTF-8 (parameter `data`).
#[function("convert_from(bytea, varchar) -> varchar")]
pub fn convert_from(data: &[u8], src_encoding: &str, writer: &mut impl Write) -> Result<()> {
    match CharacterSet::recognize(src_encoding)? {
        CharacterSet::Utf8 => {
            // Validate the bytes in place. Copying them into an owned `String` only to
            // borrow it again for `write_str` would cost an allocation per call.
            let text = std::str::from_utf8(data).map_err(|e| ExprError::InvalidParam {
                name: "data",
                reason: e.to_string().into(),
            })?;
            // NOTE(review): presumably `writer` is an in-memory buffer whose `write_str`
            // cannot fail — confirm against the `#[function]` macro's generated caller.
            writer.write_str(text).unwrap();
            Ok(())
        }
    }
}

/// Encodes `string` into the `dest_encoding` character set and returns the bytes.
///
/// Only UTF-8 is recognized as a destination encoding, so the result is the
/// string's own byte representation.
///
/// # Errors
///
/// Propagates the error from `CharacterSet::recognize` when `dest_encoding` is not
/// a recognized encoding name.
#[function("convert_to(varchar, varchar) -> bytea")]
pub fn convert_to(string: &str, dest_encoding: &str) -> Result<Box<[u8]>> {
    let charset = CharacterSet::recognize(dest_encoding)?;
    let encoded: Box<[u8]> = match charset {
        CharacterSet::Utf8 => Box::from(string.as_bytes()),
    };
    Ok(encoded)
}

// According to https://www.postgresql.org/docs/current/functions-binarystring.html#ENCODE-FORMAT-BASE64,
// the encoded output must be broken into lines with newlines when its length is
// greater than or equal to 76.
fn encode_bytes_base64(data: &[u8], writer: &mut impl Write) -> Result<()> {
Expand Down
2 changes: 2 additions & 0 deletions src/frontend/src/binder/expr/function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -803,6 +803,8 @@ impl Binder {
("string_to_array", raw_call(ExprType::StringToArray)),
("encode", raw_call(ExprType::Encode)),
("decode", raw_call(ExprType::Decode)),
("convert_from", raw_call(ExprType::ConvertFrom)),
("convert_to", raw_call(ExprType::ConvertTo)),
("sha1", raw_call(ExprType::Sha1)),
("sha224", raw_call(ExprType::Sha224)),
("sha256", raw_call(ExprType::Sha256)),
Expand Down
4 changes: 3 additions & 1 deletion src/frontend/src/expr/pure.rs
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,9 @@ impl ExprVisitor for ImpureAnalyzer {
| expr_node::Type::PgwireRecv
| expr_node::Type::ArrayTransform
| expr_node::Type::Greatest
| expr_node::Type::Least =>
| expr_node::Type::Least
| expr_node::Type::ConvertFrom
| expr_node::Type::ConvertTo =>
// expression output is deterministic(same result for the same input)
{
func_call
Expand Down

0 comments on commit edd0805

Please sign in to comment.