From a097a5b235136af53c29811ba3c9dc2e6a1f7bc5 Mon Sep 17 00:00:00 2001 From: xxchan Date: Tue, 16 Jul 2024 16:19:18 +0800 Subject: [PATCH 1/2] refactor(types): doc & refine ToBinary/Text --- src/common/src/types/datetime.rs | 42 +---------------------------- src/common/src/types/decimal.rs | 16 +---------- src/common/src/types/interval.rs | 13 --------- src/common/src/types/mod.rs | 22 +++++++++++---- src/common/src/types/serial.rs | 13 +-------- src/common/src/types/timestamptz.rs | 15 +---------- src/common/src/types/to_binary.rs | 32 +++++++++++----------- src/common/src/types/to_text.rs | 33 +++++++++-------------- src/connector/src/parser/mysql.rs | 4 +-- src/utils/pgwire/src/types.rs | 1 + 10 files changed, 53 insertions(+), 138 deletions(-) diff --git a/src/common/src/types/datetime.rs b/src/common/src/types/datetime.rs index bac96b6c1dea5..fac104b3f5aac 100644 --- a/src/common/src/types/datetime.rs +++ b/src/common/src/types/datetime.rs @@ -20,7 +20,7 @@ use std::hash::Hash; use std::io::Write; use std::str::FromStr; -use bytes::{Bytes, BytesMut}; +use bytes::BytesMut; use chrono::{ DateTime, Datelike, Days, Duration, NaiveDate, NaiveDateTime, NaiveTime, Timelike, Weekday, }; @@ -28,7 +28,6 @@ use postgres_types::{accepts, to_sql_checked, FromSql, IsNull, ToSql, Type}; use risingwave_common_estimate_size::ZeroHeapSize; use thiserror::Error; -use super::to_binary::ToBinary; use super::to_text::ToText; use super::{CheckedAdd, DataType, Interval}; use crate::array::{ArrayError, ArrayResult}; @@ -427,45 +426,6 @@ impl ToText for Timestamp { } } -impl ToBinary for Date { - fn to_binary_with_type(&self, ty: &DataType) -> super::to_binary::Result> { - match ty { - super::DataType::Date => { - let mut output = BytesMut::new(); - self.0.to_sql(&Type::ANY, &mut output).unwrap(); - Ok(Some(output.freeze())) - } - _ => unreachable!(), - } - } -} - -impl ToBinary for Time { - fn to_binary_with_type(&self, ty: &DataType) -> super::to_binary::Result> { - match ty { - super::DataType::Time => { - let mut output = BytesMut::new(); - self.0.to_sql(&Type::ANY, &mut output).unwrap(); - Ok(Some(output.freeze())) - } - _ => unreachable!(), - } - } -} - -impl ToBinary for Timestamp { - fn to_binary_with_type(&self, ty: &DataType) -> super::to_binary::Result> { - match ty { - super::DataType::Timestamp => { - let mut output = BytesMut::new(); - self.0.to_sql(&Type::ANY, &mut output).unwrap(); - Ok(Some(output.freeze())) - } - _ => unreachable!(), - } - } -} - impl Date { pub fn with_days(days: i32) -> Result { Ok(Date::new( diff --git a/src/common/src/types/decimal.rs b/src/common/src/types/decimal.rs index b38dbb4822e68..9523157239e2e 100644 --- a/src/common/src/types/decimal.rs +++ b/src/common/src/types/decimal.rs @@ -17,7 +17,7 @@ use std::io::{Cursor, Read, Write}; use std::ops::{Add, Div, Mul, Neg, Rem, Sub}; use byteorder::{BigEndian, ReadBytesExt}; -use bytes::{BufMut, Bytes, BytesMut}; +use bytes::{BufMut, BytesMut}; use num_traits::{ CheckedAdd, CheckedDiv, CheckedMul, CheckedNeg, CheckedRem, CheckedSub, Num, One, Zero, }; @@ -26,7 +26,6 @@ use risingwave_common_estimate_size::ZeroHeapSize; use rust_decimal::prelude::FromStr; use rust_decimal::{Decimal as RustDecimal, Error, MathematicalOps as _, RoundingStrategy}; -use super::to_binary::ToBinary; use super::to_text::ToText; use super::DataType; use crate::array::ArrayResult; @@ -90,19 +89,6 @@ impl Decimal { } } -impl ToBinary for Decimal { - fn to_binary_with_type(&self, ty: &DataType) -> super::to_binary::Result> { - match ty { - DataType::Decimal => { - let mut output = BytesMut::new(); - self.to_sql(&Type::NUMERIC, &mut output).unwrap(); - Ok(Some(output.freeze())) - } - _ => unreachable!(), - } - } -} - impl ToSql for Decimal { accepts!(NUMERIC); diff --git a/src/common/src/types/interval.rs b/src/common/src/types/interval.rs index 495561561e937..d446669d4a949 100644 --- a/src/common/src/types/interval.rs +++ b/src/common/src/types/interval.rs @@ -1181,19 +1181,6 @@ impl<'a> FromSql<'a> for Interval { } } -impl ToBinary for Interval { - fn to_binary_with_type(&self, ty: &DataType) -> super::to_binary::Result> { - match ty { - DataType::Interval => { - let mut output = BytesMut::new(); - self.to_sql(&Type::ANY, &mut output).unwrap(); - Ok(Some(output.freeze())) - } - _ => unreachable!(), - } - } -} - #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub enum DateTimeField { Year, diff --git a/src/common/src/types/mod.rs b/src/common/src/types/mod.rs index 91bebde846f0b..ed424b4f4956c 100644 --- a/src/common/src/types/mod.rs +++ b/src/common/src/types/mod.rs @@ -89,7 +89,6 @@ pub use self::serial::Serial; pub use self::struct_type::StructType; pub use self::successor::Successor; pub use self::timestamptz::*; -pub use self::to_binary::ToBinary; pub use self::to_text::ToText; pub use self::with_data_type::WithDataType; @@ -505,6 +504,10 @@ macro_rules! scalar_impl_enum { ($( { $variant_name:ident, $suffix_name:ident, $scalar:ty, $scalar_ref:ty } ),*) => { /// `ScalarImpl` embeds all possible scalars in the evaluation framework. /// + /// Note: `ScalarImpl` doesn't contain all information of its `DataType`, + /// so sometimes they need to be used together. + /// e.g., for `Struct`, we don't have the field names in the value. + /// /// See `for_all_variants` for the definition. #[derive(Debug, Clone, PartialEq, Eq, EstimateSize)] pub enum ScalarImpl { @@ -513,6 +516,12 @@ macro_rules! scalar_impl_enum { /// `ScalarRefImpl` embeds all possible scalar references in the evaluation /// framework. + /// + /// Note: `ScalarRefImpl` doesn't contain all information of its `DataType`, + /// so sometimes they need to be used together. + /// e.g., for `Struct`, we don't have the field names in the value. + /// + /// See `for_all_variants` for the definition. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub enum ScalarRefImpl<'scalar> { $( $variant_name($scalar_ref) ),* @@ -791,7 +800,9 @@ impl From for ScalarImpl { } impl ScalarImpl { - /// Creates a scalar from binary. + /// Creates a scalar from pgwire "BINARY" format. + /// + /// The counterpart of [`to_binary::ToBinary`]. pub fn from_binary(bytes: &Bytes, data_type: &DataType) -> Result { let res = match data_type { DataType::Varchar => Self::Utf8(String::from_sql(&Type::VARCHAR, bytes)?.into()), @@ -827,7 +838,9 @@ impl ScalarImpl { Ok(res) } - /// Creates a scalar from text. + /// Creates a scalar from pgwire "TEXT" format. + /// + /// The counterpart of [`ToText`]. pub fn from_text(s: &str, data_type: &DataType) -> Result { Ok(match data_type { DataType::Boolean => str_to_bool(s)?.into(), @@ -908,9 +921,8 @@ pub fn hash_datum(datum: impl ToDatumRef, state: &mut impl std::hash::Hasher) { } impl ScalarRefImpl<'_> { - /// Encode the scalar to postgresql binary format. - /// The encoder implements encoding using pub fn binary_format(&self, data_type: &DataType) -> to_binary::Result { + use self::to_binary::ToBinary; self.to_binary_with_type(data_type).transpose().unwrap() } diff --git a/src/common/src/types/serial.rs b/src/common/src/types/serial.rs index 5f4ba237ee300..c36fb1eb11ff2 100644 --- a/src/common/src/types/serial.rs +++ b/src/common/src/types/serial.rs @@ -24,7 +24,7 @@ use crate::util::row_id::RowId; // Serial is an alias for i64 #[derive(Debug, Copy, Clone, PartialEq, Eq, Ord, PartialOrd, Default, Hash)] -pub struct Serial(i64); +pub struct Serial(pub(crate) i64); impl From for i64 { fn from(value: Serial) -> i64 { @@ -75,17 +75,6 @@ impl crate::types::to_text::ToText for Serial { } } -impl crate::types::to_binary::ToBinary for Serial { - fn to_binary_with_type( - &self, - _ty: &crate::types::DataType, - ) -> super::to_binary::Result> { - let mut output = bytes::BytesMut::new(); - self.0.to_sql(&Type::ANY, &mut output).unwrap(); - Ok(Some(output.freeze())) - } -} - impl ToSql for Serial { accepts!(INT8); diff --git a/src/common/src/types/timestamptz.rs b/src/common/src/types/timestamptz.rs index feafee6e212bf..e5e7cb3c71fdf 100644 --- a/src/common/src/types/timestamptz.rs +++ b/src/common/src/types/timestamptz.rs @@ -16,14 +16,13 @@ use std::error::Error; use std::io::Write; use std::str::FromStr; -use bytes::{Bytes, BytesMut}; +use bytes::BytesMut; use chrono::{DateTime, Datelike, TimeZone, Utc}; use chrono_tz::Tz; use postgres_types::{accepts, to_sql_checked, FromSql, IsNull, ToSql, Type}; use risingwave_common_estimate_size::ZeroHeapSize; use serde::{Deserialize, Serialize}; -use super::to_binary::ToBinary; use super::to_text::ToText; use super::DataType; use crate::array::ArrayResult; @@ -65,18 +64,6 @@ impl<'a> FromSql<'a> for Timestamptz { } } -impl ToBinary for Timestamptz { - fn to_binary_with_type(&self, _ty: &DataType) -> super::to_binary::Result> { - let instant = self.to_datetime_utc(); - let mut out = BytesMut::new(); - // postgres_types::Type::ANY is only used as a placeholder. - instant - .to_sql(&postgres_types::Type::ANY, &mut out) - .unwrap(); - Ok(Some(out.freeze())) - } -} - impl ToText for Timestamptz { fn write(&self, f: &mut W) -> std::fmt::Result { // Just a meaningful representation as placeholder. The real implementation depends diff --git a/src/common/src/types/to_binary.rs b/src/common/src/types/to_binary.rs index 5ab9fd316dcad..56eea301f3f61 100644 --- a/src/common/src/types/to_binary.rs +++ b/src/common/src/types/to_binary.rs @@ -15,7 +15,10 @@ use bytes::{Bytes, BytesMut}; use postgres_types::{ToSql, Type}; -use super::{DataType, DatumRef, ScalarRefImpl, F32, F64}; +use super::{ + DataType, Date, Decimal, Interval, ScalarRefImpl, Serial, Time, Timestamp, Timestamptz, F32, + F64, +}; use crate::error::NotImplemented; /// Error type for [`ToBinary`] trait. @@ -30,14 +33,15 @@ pub enum ToBinaryError { pub type Result = std::result::Result; -// Used to convert ScalarRef to text format +/// Converts `ScalarRef` to pgwire "BINARY" format. +/// +/// [`postgres_types::ToSql`] has similar functionality, and most of our types implement +/// that trait and forward `ToBinary` to it directly. pub trait ToBinary { fn to_binary_with_type(&self, ty: &DataType) -> Result>; } - -// implement use to_sql macro_rules! implement_using_to_sql { - ($({ $scalar_type:ty, $data_type:ident, $accessor:expr } ),*) => { + ($({ $scalar_type:ty, $data_type:ident, $accessor:expr } ),* $(,)?) => { $( impl ToBinary for $scalar_type { fn to_binary_with_type(&self, ty: &DataType) -> Result> { @@ -64,7 +68,14 @@ implement_using_to_sql! { { F32, Float32, |x: &F32| x.0 }, { F64, Float64, |x: &F64| x.0 }, { bool, Boolean, |x| x }, - { &[u8], Bytea, |x| x } + { &[u8], Bytea, |x| x }, + { Time, Time, |x: &Time| x.0 }, + { Date, Date, |x: &Date| x.0 }, + { Timestamp, Timestamp, |x: &Timestamp| x.0 }, + { Decimal, Decimal, |x| x }, + { Interval, Interval, |x| x }, + { Serial, Serial, |x: &Serial| x.0 }, + { Timestamptz, Timestamptz, |x: &Timestamptz| x.to_datetime_utc() } } impl ToBinary for ScalarRefImpl<'_> { @@ -94,12 +105,3 @@ impl ToBinary for ScalarRefImpl<'_> { } } } - -impl ToBinary for DatumRef<'_> { - fn to_binary_with_type(&self, ty: &DataType) -> Result> { - match self { - Some(scalar) => scalar.to_binary_with_type(ty), - None => Ok(None), - } - } -} diff --git a/src/common/src/types/to_text.rs b/src/common/src/types/to_text.rs index ca140b93c37b7..b578656cee444 100644 --- a/src/common/src/types/to_text.rs +++ b/src/common/src/types/to_text.rs @@ -18,7 +18,14 @@ use std::num::FpCategory; use super::{DataType, DatumRef, ScalarRefImpl}; use crate::dispatch_scalar_ref_variants; -// Used to convert ScalarRef to text format +/// Converts `ScalarRef` to text format. +/// This is the implementation for casting to varchar, and pgwire "TEXT" format. +/// +/// For some types, the implementation diverge from Rust's standard `ToString`/`Display`, +/// to match PostgreSQL's representation. +/// +/// FIXME: `ToText` should depend on a lot of other stuff +/// but we have not implemented them yet: timezone, date style, interval style, bytea output, etc pub trait ToText { /// Write the text to the writer *regardless* of its data type /// @@ -39,26 +46,10 @@ pub trait ToText { /// text. E.g. for Int64, it will convert to text as a Int64 type. /// We should prefer to use `to_text_with_type` because it's more clear and readable. /// - /// Following is the relationship between scalar and default type: - /// - `ScalarRefImpl::Int16` -> `DataType::Int16` - /// - `ScalarRefImpl::Int32` -> `DataType::Int32` - /// - `ScalarRefImpl::Int64` -> `DataType::Int64` - /// - `ScalarRefImpl::Int256` -> `DataType::Int256` - /// - `ScalarRefImpl::Float32` -> `DataType::Float32` - /// - `ScalarRefImpl::Float64` -> `DataType::Float64` - /// - `ScalarRefImpl::Decimal` -> `DataType::Decimal` - /// - `ScalarRefImpl::Bool` -> `DataType::Boolean` - /// - `ScalarRefImpl::Utf8` -> `DataType::Varchar` - /// - `ScalarRefImpl::Bytea` -> `DataType::Bytea` - /// - `ScalarRefImpl::Date` -> `DataType::Date` - /// - `ScalarRefImpl::Time` -> `DataType::Time` - /// - `ScalarRefImpl::Timestamp` -> `DataType::Timestamp` - /// - `ScalarRefImpl::Timestamptz` -> `DataType::Timestamptz` - /// - `ScalarRefImpl::Interval` -> `DataType::Interval` - /// - `ScalarRefImpl::Jsonb` -> `DataType::Jsonb` - /// - `ScalarRefImpl::List` -> `DataType::List` - /// - `ScalarRefImpl::Struct` -> `DataType::Struct` - /// - `ScalarRefImpl::Serial` -> `DataType::Serial` + /// Note: currently the `DataType` param is actually unnecessary. + /// Previously, Timestamptz is also represented as int64, and we need the data type to distinguish them. + /// Now we have 1-1 mapping, and it happens to be the case that PostgreSQL default `ToText` format does + /// not need additional metadata like field names contained in `DataType`. fn to_text(&self) -> String { let mut s = String::new(); self.write(&mut s).unwrap(); diff --git a/src/connector/src/parser/mysql.rs b/src/connector/src/parser/mysql.rs index d1df27263e808..a28dddc9aa65a 100644 --- a/src/connector/src/parser/mysql.rs +++ b/src/connector/src/parser/mysql.rs @@ -149,7 +149,7 @@ mod tests { use mysql_async::Row as MySqlRow; use risingwave_common::catalog::{Field, Schema}; use risingwave_common::row::Row; - use risingwave_common::types::{DataType, ToText}; + use risingwave_common::types::DataType; use tokio_stream::StreamExt; use crate::parser::mysql_row_to_owned_row; @@ -187,7 +187,7 @@ mod tests { let d = owned_row.datum_at(2); if let Some(scalar) = d { let v = scalar.into_timestamptz(); - println!("timestamp: {}", v.to_text()); + println!("timestamp: {:?}", v); } } } diff --git a/src/utils/pgwire/src/types.rs b/src/utils/pgwire/src/types.rs index d4d37e1168ea1..c76aa20aac4cd 100644 --- a/src/utils/pgwire/src/types.rs +++ b/src/utils/pgwire/src/types.rs @@ -59,6 +59,7 @@ impl Index for Row { } } +/// #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum Format { Binary, From 45df1dac351d69868acc3ea2c47cdbf8e8417eb8 Mon Sep 17 00:00:00 2001 From: xxchan Date: Wed, 17 Jul 2024 04:42:42 +0800 Subject: [PATCH 2/2] refine notes on cast --- src/common/src/types/to_text.rs | 14 ++++++++++++-- src/expr/impl/src/scalar/cast.rs | 6 +++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/common/src/types/to_text.rs b/src/common/src/types/to_text.rs index b578656cee444..166356f1977e8 100644 --- a/src/common/src/types/to_text.rs +++ b/src/common/src/types/to_text.rs @@ -18,12 +18,22 @@ use std::num::FpCategory; use super::{DataType, DatumRef, ScalarRefImpl}; use crate::dispatch_scalar_ref_variants; -/// Converts `ScalarRef` to text format. -/// This is the implementation for casting to varchar, and pgwire "TEXT" format. +/// Converts `ScalarRef` to pgwire "TEXT" format. +/// +/// ## Relationship with casting to varchar +/// +/// For most types, this is also the implementation for casting to varchar, but there are exceptions. +/// e.g., The TEXT format for boolean is `t` / `f` while they cast to varchar `true` / `false`. +/// - +/// - +/// +/// ## Relationship with `ToString`/`Display` /// /// For some types, the implementation diverge from Rust's standard `ToString`/`Display`, /// to match PostgreSQL's representation. /// +/// --- +/// /// FIXME: `ToText` should depend on a lot of other stuff /// but we have not implemented them yet: timezone, date style, interval style, bytea output, etc pub trait ToText { diff --git a/src/expr/impl/src/scalar/cast.rs b/src/expr/impl/src/scalar/cast.rs index 48218d7200809..0c93c0ed15dd9 100644 --- a/src/expr/impl/src/scalar/cast.rs +++ b/src/expr/impl/src/scalar/cast.rs @@ -147,8 +147,8 @@ pub fn int_to_bool(input: i32) -> bool { input != 0 } -// For most of the types, cast them to varchar is similar to return their text format. -// So we use this function to cast type to varchar. +/// For most of the types, cast them to varchar is the same as their pgwire "TEXT" format. +/// So we use `ToText` to cast type to varchar. #[function("cast(*int) -> varchar")] #[function("cast(decimal) -> varchar")] #[function("cast(*float) -> varchar")] @@ -177,7 +177,7 @@ pub fn bool_to_varchar(input: bool, writer: &mut impl Write) { .unwrap(); } -/// `bool_out` is different from `general_to_string` to produce a single char. `PostgreSQL` +/// `bool_out` is different from `cast(boolean) -> varchar` to produce a single char. `PostgreSQL` /// uses different variants of bool-to-string in different situations. #[function("bool_out(boolean) -> varchar")] pub fn bool_out(input: bool, writer: &mut impl Write) {