From aa33f2f05698a94ff1d70dba6eb5726b413d5a35 Mon Sep 17 00:00:00 2001
From: xxchan <xxchan22f@gmail.com>
Date: Fri, 26 Jul 2024 21:06:27 +0800
Subject: [PATCH] feat: support create table with map type

---
 e2e_test/batch/distribution_mode.slt     |  2 +-
 e2e_test/batch/local_mode.slt            |  2 +-
 e2e_test/batch/types/list.slt.part       |  2 -
 e2e_test/batch/types/map.slt.part        | 57 ++++++++++++++++++++++++
 e2e_test/batch/types/struct.slt.part     |  1 -
 src/common/src/types/map_type.rs         |  9 ++--
 src/frontend/src/binder/expr/mod.rs      |  7 ++-
 src/frontend/src/binder/insert.rs        | 17 +++++--
 src/frontend/src/binder/select.rs        |  4 +-
 src/frontend/src/handler/create_mv.rs    |  2 +-
 src/sqlparser/src/ast/data_type.rs       |  5 +++
 src/sqlparser/src/parser.rs              |  3 +-
 src/sqlparser/src/parser_v2/data_type.rs | 57 ++++++++++++++++--------
 13 files changed, 130 insertions(+), 38 deletions(-)
 delete mode 100644 e2e_test/batch/types/list.slt.part
 create mode 100644 e2e_test/batch/types/map.slt.part
 delete mode 100644 e2e_test/batch/types/struct.slt.part
diff --git a/e2e_test/batch/distribution_mode.slt b/e2e_test/batch/distribution_mode.slt
index c125000a3acd4..b680796277c13 100644
--- a/e2e_test/batch/distribution_mode.slt
+++ b/e2e_test/batch/distribution_mode.slt
@@ -4,7 +4,7 @@ SET RW_IMPLICIT_FLUSH TO true;
 statement ok
 SET QUERY_MODE TO distributed;
 
-include ./basic/*.slt.part
+include ./basic/**/*.slt.part
 include ./duckdb/all.slt.part
 include ./order/*.slt.part
 include ./join/*.slt.part
diff --git a/e2e_test/batch/local_mode.slt b/e2e_test/batch/local_mode.slt
index c3818989443a7..68df9f0d91950 100644
--- a/e2e_test/batch/local_mode.slt
+++ b/e2e_test/batch/local_mode.slt
@@ -4,7 +4,7 @@ SET RW_IMPLICIT_FLUSH TO true;
 statement ok
 SET QUERY_MODE TO local;
 
-include ./basic/*.slt.part
+include ./basic/**/*.slt.part
 include ./duckdb/all.slt.part
 include ./order/*.slt.part
 include ./join/*.slt.part
diff --git a/e2e_test/batch/types/list.slt.part b/e2e_test/batch/types/list.slt.part
deleted file mode 100644
index 031a466a5a3b2..0000000000000
--- a/e2e_test/batch/types/list.slt.part
+++ /dev/null
@@ -1,2 +0,0 @@
-# Test cases for list don't work for now as the parser cannot recognize the cast expression.
-# include list/*.slt.part
diff --git a/e2e_test/batch/types/map.slt.part b/e2e_test/batch/types/map.slt.part
new file mode 100644
index 0000000000000..d8af2b5c87f82
--- /dev/null
+++ b/e2e_test/batch/types/map.slt.part
@@ -0,0 +1,57 @@
+statement ok
+SET RW_IMPLICIT_FLUSH TO true;
+
+
+statement error
+create table t (m map (float, float));
+----
+db error: ERROR: Failed to run the query
+
+Caused by:
+  invalid map key type: double precision
+
+
+statement ok
+create table t (
+    m1 map(varchar, float),
+    m2 map(int, bool),
+    m3 map(varchar, map(varchar, varchar)),
+    l map(varchar,int)[],
+    s struct<m map(varchar, struct<x int>)>,
+);
+
+
+statement ok
+insert into t values (
+    map_from_entries(array['a','b','c'], array[1.0,2.0,3.0]::float[]),
+    map_from_entries(array[1,2,3], array[true,false,true]),
+    map_from_entries(array['a','b'],
+        array[
+            map_from_entries(array['a1'], array['a2']),
+            map_from_entries(array['b1'], array['b2'])
+        ]
+    ),
+    array[
+        map_from_entries(array['a','b','c'], array[1,2,3]),
+        map_from_entries(array['d','e','f'], array[4,5,6])
+    ],
+    row(
+        map_from_entries(array['a','b','c'], array[row(1),row(2),row(3)]::struct<x int>[])
+    )
+);
+
+query ?????
+select * from t;
+----
+{"a":1,"b":2,"c":3}	{"1":t,"2":f,"3":t}	{"a":{"a1":a2},"b":{"b1":b2}}	{"{\"a\":1,\"b\":2,\"c\":3}","{\"d\":4,\"e\":5,\"f\":6}"}	("{""a"":(1),""b"":(2),""c"":(3)}")
+
+
+# FIXME: The cast should be supported
+statement error
+insert into t(m1) values (map_from_entries(array['a','b','c'], array[1,2,3]));
+----
+db error: ERROR: Failed to run the query
+
+Caused by these errors (recent errors listed first):
+  1: failed to cast the 1st column
+  2: cannot cast type "map(character varying,integer)" to "map(character varying,double precision)" in Assign context
diff --git a/e2e_test/batch/types/struct.slt.part b/e2e_test/batch/types/struct.slt.part
deleted file mode 100644
index 396881000d48a..0000000000000
--- a/e2e_test/batch/types/struct.slt.part
+++ /dev/null
@@ -1 +0,0 @@
-include struct/*.slt.part
diff --git a/src/common/src/types/map_type.rs b/src/common/src/types/map_type.rs
index 096fb1c7b310f..11600bb45b42d 100644
--- a/src/common/src/types/map_type.rs
+++ b/src/common/src/types/map_type.rs
@@ -50,10 +50,11 @@ impl MapType {
             .iter()
             .collect_tuple()
             .expect("the underlying struct for map must have exactly two fields");
-        if cfg!(debug_assertions) {
-            // the field names are not strictly enforced
-            itertools::assert_equal(struct_type.names(), ["key", "value"]);
-        }
+        // the field names are not strictly enforced
+        // Currently this panics for SELECT * FROM t
+        // if cfg!(debug_assertions) {
+        //     itertools::assert_equal(struct_type.names(), ["key", "value"]);
+        // }
         Self::from_kv(k.1.clone(), v.1.clone())
     }
 
diff --git a/src/frontend/src/binder/expr/mod.rs b/src/frontend/src/binder/expr/mod.rs
index 308f34a31d447..d11134b097e3b 100644
--- a/src/frontend/src/binder/expr/mod.rs
+++ b/src/frontend/src/binder/expr/mod.rs
@@ -14,7 +14,7 @@
 
 use itertools::Itertools;
 use risingwave_common::catalog::{ColumnDesc, ColumnId, PG_CATALOG_SCHEMA_NAME};
-use risingwave_common::types::DataType;
+use risingwave_common::types::{DataType, MapType};
 use risingwave_common::util::iter_util::zip_eq_fast;
 use risingwave_common::{bail_no_function, bail_not_implemented, not_implemented};
 use risingwave_pb::plan_common::{AdditionalColumn, ColumnDescVersion};
@@ -999,6 +999,11 @@ pub fn bind_data_type(data_type: &AstDataType) -> Result<DataType> {
                 .collect::<Result<Vec<_>>>()?,
             types.iter().map(|f| f.name.real_value()).collect_vec(),
         ),
+        AstDataType::Map(kv) => {
+            let key = bind_data_type(&kv.0)?;
+            let value = bind_data_type(&kv.1)?;
+            DataType::Map(MapType::try_from_kv(key, value)?)
+        }
         AstDataType::Custom(qualified_type_name) => {
             let idents = qualified_type_name
                 .0
diff --git a/src/frontend/src/binder/insert.rs b/src/frontend/src/binder/insert.rs
index e0b5ce422e75a..8423f02b29397 100644
--- a/src/frontend/src/binder/insert.rs
+++ b/src/frontend/src/binder/insert.rs
@@ -14,6 +14,7 @@
 
 use std::collections::{BTreeMap, HashMap, HashSet};
 
+use anyhow::Context;
 use itertools::Itertools;
 use risingwave_common::catalog::{ColumnCatalog, Schema, TableVersionId};
 use risingwave_common::types::DataType;
@@ -26,6 +27,7 @@ use crate::binder::{Binder, Clause};
 use crate::catalog::TableId;
 use crate::error::{ErrorCode, Result, RwError};
 use crate::expr::{ExprImpl, InputRef};
+use crate::handler::create_mv::ordinal;
 use crate::user::UserId;
 
 #[derive(Debug, Clone)]
@@ -197,7 +199,7 @@ impl Binder {
         let bound_query;
         let cast_exprs;
 
-        let bounded_column_nums = match source.as_simple_values() {
+        let bound_column_nums = match source.as_simple_values() {
             None => {
                 bound_query = self.bind_query(source)?;
                 let actual_types = bound_query.data_types();
@@ -234,7 +236,7 @@ impl Binder {
             cols_to_insert_in_table.len()
         };
 
-        let (err_msg, default_column_indices) = match num_target_cols.cmp(&bounded_column_nums) {
+        let (err_msg, default_column_indices) = match num_target_cols.cmp(&bound_column_nums) {
             std::cmp::Ordering::Equal => (None, default_column_indices),
             std::cmp::Ordering::Greater => {
                 if has_user_specified_columns {
@@ -248,7 +250,7 @@ impl Binder {
                     //      insert into t values (7)
                     // this kind of usage is fine, null values will be provided
                     // implicitly.
-                    (None, col_indices_to_insert.split_off(bounded_column_nums))
+                    (None, col_indices_to_insert.split_off(bound_column_nums))
                 }
             }
             std::cmp::Ordering::Less => {
@@ -315,7 +317,14 @@ impl Binder {
                 return exprs
                     .into_iter()
                     .zip_eq_fast(expected_types.iter().take(expr_num))
-                    .map(|(e, t)| e.cast_assign(t.clone()).map_err(Into::into))
+                    .enumerate()
+                    .map(|(i, (e, t))| {
+                        e.cast_assign(t.clone())
+                            .with_context(|| {
+                                format!("failed to cast the {} column", ordinal(i + 1))
+                            })
+                            .map_err(Into::into)
+                    })
                     .try_collect();
             }
         };
diff --git a/src/frontend/src/binder/select.rs b/src/frontend/src/binder/select.rs
index e15a9eac73246..183e4a13a5a58 100644
--- a/src/frontend/src/binder/select.rs
+++ b/src/frontend/src/binder/select.rs
@@ -714,8 +714,8 @@ fn data_type_to_alias(data_type: &AstDataType) -> Option<String> {
         AstDataType::Jsonb => "jsonb".to_string(),
         AstDataType::Array(ty) => return data_type_to_alias(ty),
         AstDataType::Custom(ty) => format!("{}", ty),
-        AstDataType::Struct(_) => {
-            // Note: Postgres doesn't have anonymous structs
+        AstDataType::Struct(_) | AstDataType::Map(_) => {
+            // Note: Postgres doesn't have maps and anonymous structs
             return None;
         }
     };
diff --git a/src/frontend/src/handler/create_mv.rs b/src/frontend/src/handler/create_mv.rs
index 0eabe34696150..a834173cfd36f 100644
--- a/src/frontend/src/handler/create_mv.rs
+++ b/src/frontend/src/handler/create_mv.rs
@@ -278,7 +278,7 @@ It only indicates the physical clustering of the data, which may improve the per
     ))
 }
 
-fn ordinal(i: usize) -> String {
+pub fn ordinal(i: usize) -> String {
     let s = i.to_string();
     let suffix = if s.ends_with('1') && !s.ends_with("11") {
         "st"
diff --git a/src/sqlparser/src/ast/data_type.rs b/src/sqlparser/src/ast/data_type.rs
index 1e588955f093b..96c67659beb4f 100644
--- a/src/sqlparser/src/ast/data_type.rs
+++ b/src/sqlparser/src/ast/data_type.rs
@@ -70,6 +70,8 @@ pub enum DataType {
     Array(Box<DataType>),
     /// Structs
     Struct(Vec<StructField>),
+    /// Map(key_type, value_type)
+    Map(Box<(DataType, DataType)>),
 }
 
 impl fmt::Display for DataType {
@@ -110,6 +112,9 @@ impl fmt::Display for DataType {
             DataType::Struct(defs) => {
                 write!(f, "STRUCT<{}>", display_comma_separated(defs))
             }
+            DataType::Map(kv) => {
+                write!(f, "MAP({},{})", kv.0, kv.1)
+            }
         }
     }
 }
diff --git a/src/sqlparser/src/parser.rs b/src/sqlparser/src/parser.rs
index d683bf39f5d45..ce4e34be93333 100644
--- a/src/sqlparser/src/parser.rs
+++ b/src/sqlparser/src/parser.rs
@@ -3632,8 +3632,7 @@ impl Parser<'_> {
         .parse_next(self)
     }
 
-    /// Parse a SQL datatype (in the context of a CREATE TABLE statement for example) and convert
-    /// into an array of that datatype if needed
+    /// Parse a SQL datatype (in the context of a CREATE TABLE statement for example)
     pub fn parse_data_type(&mut self) -> PResult<DataType> {
         parser_v2::data_type(self)
     }
diff --git a/src/sqlparser/src/parser_v2/data_type.rs b/src/sqlparser/src/parser_v2/data_type.rs
index 2a66344fb0357..2544e54d1d542 100644
--- a/src/sqlparser/src/parser_v2/data_type.rs
+++ b/src/sqlparser/src/parser_v2/data_type.rs
@@ -115,6 +115,11 @@ where
 /// Consume a data type definition.
 ///
 /// The parser is the main entry point for data type parsing.
+///
+/// Note: in recursion, we should use `data_type_stateful` instead of `data_type`,
+/// otherwise the type parameter will recurse like `Stateful<Stateful<Stateful<...>>>`.
+/// Also note that we cannot use `Parser<'_>` directly to avoid misuse, because we need
+/// generics `<S>` to parameterize over `Parser<'_>` and `Stateful<Parser<'_>>`.
 pub fn data_type<S>(input: &mut S) -> PResult<DataType>
 where
     S: TokenStream,
@@ -166,6 +171,14 @@ fn data_type_stateful_inner<S>(input: &mut StatefulStream<S>) -> PResult<DataTyp
 where
     S: TokenStream,
 {
+    trace(
+        "data_type_inner",
+        alt((keyword_datatype, non_keyword_datatype)),
+    )
+    .parse_next(input)
+}
+
+fn keyword_datatype<S: TokenStream>(input: &mut StatefulStream<S>) -> PResult<DataType> {
     let with_time_zone = || {
         opt(alt((
             (Keyword::WITH, Keyword::TIME, Keyword::ZONE).value(true),
@@ -186,7 +199,7 @@ where
         })
     };
 
-    let keywords = dispatch! {keyword;
+    let mut ty = dispatch! {keyword;
         Keyword::BOOLEAN | Keyword::BOOL => empty.value(DataType::Boolean),
         Keyword::FLOAT => opt(precision_in_range(1..54)).map(DataType::Float),
         Keyword::REAL => empty.value(DataType::Real),
@@ -211,26 +224,32 @@ where
         Keyword::NUMERIC | Keyword::DECIMAL | Keyword::DEC => cut_err(precision_and_scale()).map(|(precision, scale)| {
             DataType::Decimal(precision, scale)
         }),
-        _ =>  fail,
+        _ =>  fail
     };
 
-    trace(
-        "data_type_inner",
-        alt((
-            keywords,
-            trace(
-                "non_keyword_data_type",
-                object_name.map(
-                    |name| match name.to_string().to_ascii_lowercase().as_str() {
-                        // PostgreSQL built-in data types that are not keywords.
-                        "jsonb" => DataType::Jsonb,
-                        "regclass" => DataType::Regclass,
-                        "regproc" => DataType::Regproc,
-                        _ => DataType::Custom(name),
-                    },
-                ),
-            ),
-        )),
+    ty.parse_next(input)
+}
+
+fn non_keyword_datatype<S: TokenStream>(input: &mut StatefulStream<S>) -> PResult<DataType> {
+    let type_name = object_name.parse_next(input)?;
+    match type_name.to_string().to_ascii_lowercase().as_str() {
+        // PostgreSQL built-in data types that are not keywords.
+        "jsonb" => Ok(DataType::Jsonb),
+        "regclass" => Ok(DataType::Regclass),
+        "regproc" => Ok(DataType::Regproc),
+        "map" => cut_err(map_type_arguments).parse_next(input),
+        _ => Ok(DataType::Custom(type_name)),
+    }
+}
+
+fn map_type_arguments<S: TokenStream>(input: &mut StatefulStream<S>) -> PResult<DataType> {
+    delimited(
+        Token::LParen,
+        // key is string or integral type. value is arbitrary type.
+        // We don't validate here, but in binder bind_data_type
+        seq!(keyword_datatype, _:Token::Comma, data_type_stateful),
+        Token::RParen,
     )
+    .map(|(k, v)| DataType::Map(Box::new((k, v))))
     .parse_next(input)
 }