diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala
index 5d7bcf324ae8..4a2b7040fa5e 100644
--- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala
+++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala
@@ -2192,7 +2192,7 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends GlutenClickHouseTPCHAbstr
     }
   }
 
-  test("GLUTEN-3135: Bug fix to_date") {
+  test("GLUTEN-3135/GLUTEN-7896: Bug fix to_date") {
     val create_table_sql =
       """
         | create table test_tbl_3135(id bigint, data string) using parquet
@@ -2209,13 +2209,27 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends GlutenClickHouseTPCHAbstr
         |(7, '1970-01-01 00:00:00'),
         |(8, '2024-3-2'),
         |(9, '2024-03-2'),
-        |(10, '2024-03')
+        |(10, '2024-03'),
+        |(11, '2024-03-02 11:22:33')
         |""".stripMargin
 
     spark.sql(create_table_sql)
     spark.sql(insert_data_sql)
     val select_sql = "select id, to_date(data) from test_tbl_3135"
     compareResultsAgainstVanillaSpark(select_sql, true, { _ => })
+
+    withSQLConf(("spark.sql.legacy.timeParserPolicy" -> "corrected")) {
+      compareResultsAgainstVanillaSpark(
+        "select id, to_date('2024-03-2 11:22:33', 'yyyy-MM-dd') from test_tbl_3135 where id = 11",
+        true,
+        { _ => })
+    }
+    withSQLConf(("spark.sql.legacy.timeParserPolicy" -> "legacy")) {
+      compareResultsAgainstVanillaSpark(
+        "select id, to_date(data, 'yyyy-MM-dd') from test_tbl_3135 where id = 11",
+        true,
+        { _ => })
+    }
     spark.sql("drop table test_tbl_3135")
   }
 
diff --git a/cpp-ch/local-engine/Common/CHUtil.cpp b/cpp-ch/local-engine/Common/CHUtil.cpp
index 03df93c851e3..8fef52e50a68 100644
--- a/cpp-ch/local-engine/Common/CHUtil.cpp
+++ b/cpp-ch/local-engine/Common/CHUtil.cpp
@@ -762,6 +762,11 @@ void BackendInitializerUtil::initSettings(const SparkConfigs::ConfigMap & spark_
             settings.set(key, toField(key, value));
             LOG_DEBUG(&Poco::Logger::get("CHUtil"), "Set settings key:{} value:{}", key, value);
         }
+        else if (key == TIMER_PARSER_POLICY)
+        {
+            settings.set(key, value);
+            LOG_DEBUG(&Poco::Logger::get("CHUtil"), "Set settings key:{} value:{}", key, value);
+        }
     }
 
     /// Finally apply some fixed kvs to settings.
diff --git a/cpp-ch/local-engine/Common/CHUtil.h b/cpp-ch/local-engine/Common/CHUtil.h
index cff69090ee31..a5fb24f6afee 100644
--- a/cpp-ch/local-engine/Common/CHUtil.h
+++ b/cpp-ch/local-engine/Common/CHUtil.h
@@ -40,6 +40,7 @@ namespace local_engine
 static const String MERGETREE_INSERT_WITHOUT_LOCAL_STORAGE = "mergetree.insert_without_local_storage";
 static const String MERGETREE_MERGE_AFTER_INSERT = "mergetree.merge_after_insert";
 static const std::string DECIMAL_OPERATIONS_ALLOW_PREC_LOSS = "spark.sql.decimalOperations.allowPrecisionLoss";
+static const std::string TIMER_PARSER_POLICY = "spark.sql.legacy.timeParserPolicy";
 
 static const std::unordered_set<String> BOOL_VALUE_SETTINGS{
     MERGETREE_MERGE_AFTER_INSERT, MERGETREE_INSERT_WITHOUT_LOCAL_STORAGE, DECIMAL_OPERATIONS_ALLOW_PREC_LOSS};
diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/CommonScalarFunctionParser.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/CommonScalarFunctionParser.cpp
index d6584267455f..ec8b4e0d12bf 100644
--- a/cpp-ch/local-engine/Parser/scalar_function_parser/CommonScalarFunctionParser.cpp
+++ b/cpp-ch/local-engine/Parser/scalar_function_parser/CommonScalarFunctionParser.cpp
@@ -57,7 +57,6 @@ REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Not, not, not );
 REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Xor, xor, xor);
 REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Cast, cast, CAST);
-REGISTER_COMMON_SCALAR_FUNCTION_PARSER(GetTimestamp, get_timestamp, parseDateTime64InJodaSyntaxOrNull);
 REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Quarter, quarter, toQuarter);
 
 // math functions
diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.cpp
new file mode 100644
index 000000000000..4724f820099a
--- /dev/null
+++ b/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.cpp
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <Parser/scalar_function_parser/getTimestamp.h>
+
+namespace local_engine
+{
+static FunctionParserRegister<FunctionParserGetTimestamp> register_get_timestamp;
+}
diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.h b/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.h
new file mode 100644
index 000000000000..5e32e00569f1
--- /dev/null
+++ b/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.h
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
+    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+}
+}
+
+
+namespace local_engine
+{
+class FunctionParserGetTimestamp : public FunctionParser
+{
+public:
+    explicit FunctionParserGetTimestamp(ParserContextPtr parser_context_) : FunctionParser(parser_context_) {}
+    ~FunctionParserGetTimestamp() override = default;
+
+    static constexpr auto name = "get_timestamp";
+    String getName() const override { return name; }
+
+    const ActionsDAG::Node * parse(
+        const substrait::Expression_ScalarFunction & substrait_func,
+        ActionsDAG & actions_dag) const override
+    {
+        /*
+            spark function: get_timestamp(expr, fmt)
+            1. If timeParserPolicy is LEGACY:
+               1) fmt contains no 'S': ch function = parseDateTime64InJodaSyntaxOrNull(substr(expr, 1, length(fmt)), fmt)
+               2) fmt contains one or more 'S': normalize fmt to exactly three 'S', then ch function = parseDateTime64InJodaSyntaxOrNull(expr, fmt)
+            2. Else: ch function = parseDateTime64InJodaSyntaxOrNull(expr, fmt)
+        */
+        auto parsed_args = parseFunctionArguments(substrait_func, actions_dag);
+        if (parsed_args.size() != 2)
+            throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires exactly two arguments", getName());
+        const auto * expr_arg = parsed_args[0];
+        const auto * fmt_arg = parsed_args[1];
+
+        const auto & args = substrait_func.arguments();
+        bool fmt_string_literal = args[1].value().has_literal();
+        String fmt;
+        if (fmt_string_literal)
+        {
+            const auto & literal_fmt_expr = args[1].value().literal();
+            fmt_string_literal = literal_fmt_expr.has_string();
+            fmt = fmt_string_literal ? literal_fmt_expr.string() : "";
+        }
+        if (!fmt_string_literal)
+            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "The second argument of function {} must be a const String.", name);
+
+        UInt32 s_count = std::count(fmt.begin(), fmt.end(), 'S');
+        String time_parser_policy
+            = getContext()->getSettingsRef().has(TIMER_PARSER_POLICY) ? toString(getContext()->getSettingsRef().get(TIMER_PARSER_POLICY)) : "";
+        boost::to_lower(time_parser_policy);
+        if (time_parser_policy == "legacy")
+        {
+            if (s_count == 0)
+            {
+                const auto * index_begin_node = addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeUInt32>(), 1);
+                const auto * index_end_node = addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeUInt32>(), fmt.size());
+                const auto * substr_node = toFunctionNode(actions_dag, "substringUTF8", {expr_arg, index_begin_node, index_end_node});
+                const auto * fmt_node = addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeString>(), fmt);
+                const auto * result_node = toFunctionNode(actions_dag, "parseDateTime64InJodaSyntaxOrNull", {substr_node, fmt_node});
+                return convertNodeTypeIfNeeded(substrait_func, result_node, actions_dag);
+            }
+            else if (s_count < 3)
+                fmt += String(3 - s_count, 'S');
+            else
+                fmt = fmt.substr(0, fmt.size() - (s_count - 3));
+
+            const auto * fmt_node = addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeString>(), fmt);
+            const auto * result_node = toFunctionNode(actions_dag, "parseDateTime64InJodaSyntaxOrNull", {expr_arg, fmt_node});
+            return convertNodeTypeIfNeeded(substrait_func, result_node, actions_dag);
+        }
+        else
+        {
+            const auto * result_node = toFunctionNode(actions_dag, "parseDateTime64InJodaSyntaxOrNull", {expr_arg, fmt_arg});
+            return convertNodeTypeIfNeeded(substrait_func, result_node, actions_dag);
+        }
+    }
+};
+}
diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/unixTimestamp.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/unixTimestamp.cpp
index 622237da9707..33997734c5e4 100644
--- a/cpp-ch/local-engine/Parser/scalar_function_parser/unixTimestamp.cpp
+++ b/cpp-ch/local-engine/Parser/scalar_function_parser/unixTimestamp.cpp
@@ -17,7 +17,7 @@
 
 #include
 #include
-
+#include <Parser/scalar_function_parser/getTimestamp.h>
 namespace DB
 {
 
@@ -34,10 +34,10 @@ namespace local_engine
 {
 
 template <typename Name>
-class FunctionParserUnixTimestamp : public FunctionParser
+class FunctionParserUnixTimestamp : public FunctionParserGetTimestamp
 {
 public:
-    explicit FunctionParserUnixTimestamp(ParserContextPtr parser_context_) : FunctionParser(parser_context_) {}
+    explicit FunctionParserUnixTimestamp(ParserContextPtr parser_context_) : FunctionParserGetTimestamp(parser_context_) {}
     ~FunctionParserUnixTimestamp() override = default;
 
     static constexpr auto name = Name::name;
@@ -60,13 +60,13 @@ class FunctionParserUnixTimestamp : public FunctionParser
         const auto * expr_arg = parsed_args[0];
         const auto * fmt_arg = parsed_args[1];
         auto expr_type = removeNullable(expr_arg->result_type);
+        if (isString(expr_type))
+            return FunctionParserGetTimestamp::parse(substrait_func, actions_dag);
+
         const DateLUTImpl * date_lut = &DateLUT::instance();
         const auto * time_zone_node = addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeString>(), date_lut->getTimeZone());
-
         const DB::ActionsDAG::Node * result_node = nullptr;
-        if (isString(expr_type))
-            result_node = toFunctionNode(actions_dag, "parseDateTime64InJodaSyntaxOrNull", {expr_arg, fmt_arg, time_zone_node});
-        else if (isDateOrDate32(expr_type))
+        if (isDateOrDate32(expr_type))
             result_node = toFunctionNode(actions_dag, "sparkDateToUnixTimestamp", {expr_arg, time_zone_node});
         else if (isDateTime(expr_type) || isDateTime64(expr_type))
             result_node = toFunctionNode(actions_dag, "toUnixTimestamp", {expr_arg, time_zone_node});
diff --git a/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala b/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala
index e0d06ce6fc0f..2ccdcae99b5c 100644
--- a/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala
+++ b/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala
@@ -805,7 +805,8 @@ object GlutenConfig {
       SPARK_OFFHEAP_ENABLED,
       SESSION_LOCAL_TIMEZONE.key,
       DECIMAL_OPERATIONS_ALLOW_PREC_LOSS.key,
-      SPARK_REDACTION_REGEX
+      SPARK_REDACTION_REGEX,
+      LEGACY_TIME_PARSER_POLICY.key
     )
     nativeConfMap.putAll(conf.filter(e => keys.contains(e._1)).asJava)
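
Note on the legacy-policy handling (not part of the patch): the format normalization that FunctionParserGetTimestamp::parse performs when spark.sql.legacy.timeParserPolicy is "legacy" can be sketched in a few lines of standalone Scala. The object and helper names below are illustrative only; the actual logic lives in the C++ parser above.

object LegacyGetTimestampSketch {
  // Mirrors the legacy-mode handling described in FunctionParserGetTimestamp:
  //  - fmt contains no 'S'   -> parse only the first fmt.length characters of the input
  //  - fmt contains some 'S' -> pad or trim the pattern so it carries exactly three 'S'
  def normalizeLegacyFormat(fmt: String): Either[Int, String] = {
    val sCount = fmt.count(_ == 'S')
    if (sCount == 0) Left(fmt.length)                    // truncate the input to length(fmt)
    else if (sCount < 3) Right(fmt + "S" * (3 - sCount)) // pad up to three 'S'
    else Right(fmt.dropRight(sCount - 3))                // drop the last (sCount - 3) characters, as the patch does
  }

  def main(args: Array[String]): Unit = {
    println(normalizeLegacyFormat("yyyy-MM-dd"))            // Left(10)
    println(normalizeLegacyFormat("yyyy-MM-dd HH:mm:ss.S")) // Right(yyyy-MM-dd HH:mm:ss.SSS)
  }
}

The truncation branch is what makes to_date('2024-03-02 11:22:33', 'yyyy-MM-dd') succeed under the legacy policy: Spark's legacy parser (SimpleDateFormat-based) tolerates trailing characters beyond the pattern, and cutting the input down to length(fmt) before calling parseDateTime64InJodaSyntaxOrNull reproduces that tolerance in ClickHouse, which the new GLUTEN-3135/GLUTEN-7896 test cases exercise.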