Skip to content

Commit

Permalink
[GLUTEN-7896][CH]Fix to_date diff for time parser policy config (#7923)
Browse files Browse the repository at this point in the history
* fix pre-projection not take effect

* Fix time_parser_plicy set legacy

* fix

* fix 11

* add test

* fix ci test

* Fix code bug

* fix review

* modify test
  • Loading branch information
KevinyhZou authored Nov 26, 2024
1 parent 4dfdfd7 commit efd2cbd
Show file tree
Hide file tree
Showing 8 changed files with 160 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2192,7 +2192,7 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends GlutenClickHouseTPCHAbstr
}
}

test("GLUTEN-3135: Bug fix to_date") {
test("GLUTEN-3135/GLUTEN-7896: Bug fix to_date") {
val create_table_sql =
"""
| create table test_tbl_3135(id bigint, data string) using parquet
Expand All @@ -2209,13 +2209,27 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends GlutenClickHouseTPCHAbstr
|(7, '1970-01-01 00:00:00'),
|(8, '2024-3-2'),
|(9, '2024-03-2'),
|(10, '2024-03')
|(10, '2024-03'),
|(11, '2024-03-02 11:22:33')
|""".stripMargin
spark.sql(create_table_sql)
spark.sql(insert_data_sql)

val select_sql = "select id, to_date(data) from test_tbl_3135"
compareResultsAgainstVanillaSpark(select_sql, true, { _ => })

withSQLConf(("spark.sql.legacy.timeParserPolicy" -> "corrected")) {
compareResultsAgainstVanillaSpark(
"select id, to_date('2024-03-2 11:22:33', 'yyyy-MM-dd') from test_tbl_3135 where id = 11",
true,
{ _ => })
}
withSQLConf(("spark.sql.legacy.timeParserPolicy" -> "legacy")) {
compareResultsAgainstVanillaSpark(
"select id, to_date(data, 'yyyy-MM-dd') from test_tbl_3135 where id = 11",
true,
{ _ => })
}
spark.sql("drop table test_tbl_3135")
}

Expand Down
5 changes: 5 additions & 0 deletions cpp-ch/local-engine/Common/CHUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -762,6 +762,11 @@ void BackendInitializerUtil::initSettings(const SparkConfigs::ConfigMap & spark_
settings.set(key, toField(key, value));
LOG_DEBUG(&Poco::Logger::get("CHUtil"), "Set settings key:{} value:{}", key, value);
}
else if (key == TIMER_PARSER_POLICY)
{
settings.set(key, value);
LOG_DEBUG(&Poco::Logger::get("CHUtil"), "Set settings key:{} value:{}", key, value);
}
}

/// Finally apply some fixed kvs to settings.
Expand Down
1 change: 1 addition & 0 deletions cpp-ch/local-engine/Common/CHUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ namespace local_engine
static const String MERGETREE_INSERT_WITHOUT_LOCAL_STORAGE = "mergetree.insert_without_local_storage";
static const String MERGETREE_MERGE_AFTER_INSERT = "mergetree.merge_after_insert";
static const std::string DECIMAL_OPERATIONS_ALLOW_PREC_LOSS = "spark.sql.decimalOperations.allowPrecisionLoss";
static const std::string TIMER_PARSER_POLICY = "spark.sql.legacy.timeParserPolicy";

static const std::unordered_set<String> BOOL_VALUE_SETTINGS{
MERGETREE_MERGE_AFTER_INSERT, MERGETREE_INSERT_WITHOUT_LOCAL_STORAGE, DECIMAL_OPERATIONS_ALLOW_PREC_LOSS};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Not, not, not );
REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Xor, xor, xor);

REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Cast, cast, CAST);
REGISTER_COMMON_SCALAR_FUNCTION_PARSER(GetTimestamp, get_timestamp, parseDateTime64InJodaSyntaxOrNull);
REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Quarter, quarter, toQuarter);

// math functions
Expand Down
23 changes: 23 additions & 0 deletions cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <Parser/scalar_function_parser/getTimestamp.h>

namespace local_engine
{
static FunctionParserRegister<FunctionParserGetTimestamp> register_get_timestamp;
}
106 changes: 106 additions & 0 deletions cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <DataTypes/DataTypeNullable.h>
#include <Parser/FunctionParser.h>
#include <Core/Settings.h>
#include <Core/Field.h>
#include <Common/CHUtil.h>
#include <boost/algorithm/string/case_conv.hpp>

namespace DB
{

namespace ErrorCodes
{
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
}


namespace local_engine
{
class FunctionParserGetTimestamp : public FunctionParser
{
public:
explicit FunctionParserGetTimestamp(ParserContextPtr parser_context_) : FunctionParser(parser_context_) {}
~FunctionParserGetTimestamp() override = default;

static constexpr auto name = "get_timestamp";
String getName() const override { return name; }

const ActionsDAG::Node * parse(
const substrait::Expression_ScalarFunction & substrait_func,
ActionsDAG & actions_dag) const override
{
/*
spark function: get_timestamp(expr, fmt)
1. If timeParserPolicy is LEGACY
1) fmt has 0 'S', ch function = parseDateTime64InJodaSyntaxOrNull(substr(expr,1,length(fmt)), fmt);
2) fmt has 'S' more than 0, make the fmt has 3 'S', ch function = parseDateTime64InJodaSyntaxOrNull(expr, fmt)
2. Else ch function = parseDateTime64InJodaSyntaxOrNull(expr, fmt)
*/
auto parsed_args = parseFunctionArguments(substrait_func, actions_dag);
if (parsed_args.size() != 2)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires exactly two arguments", getName());
const auto * expr_arg = parsed_args[0];
const auto * fmt_arg = parsed_args[1];

const auto & args = substrait_func.arguments();
bool fmt_string_literal = args[1].value().has_literal();
String fmt;
if (fmt_string_literal)
{
const auto & literal_fmt_expr = args[1].value().literal();
fmt_string_literal = literal_fmt_expr.has_string();
fmt = fmt_string_literal ? literal_fmt_expr.string() : "";
}
if (!fmt_string_literal)
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "The second of function {} must be const String.", name);

UInt32 s_count = std::count(fmt.begin(), fmt.end(), 'S');
String time_parser_policy = getContext()->getSettingsRef().has(TIMER_PARSER_POLICY) ? toString(getContext()->getSettingsRef().get(TIMER_PARSER_POLICY)) : "";
boost::to_lower(time_parser_policy);
if (time_parser_policy == "legacy")
{
if (s_count == 0)
{
const auto * index_begin_node = addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeUInt64>(), 1);
const auto * index_end_node = addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeUInt64>(), fmt.size());
const auto * substr_node = toFunctionNode(actions_dag, "substringUTF8", {expr_arg, index_begin_node, index_end_node});
const auto * fmt_node = addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeString>(), fmt);
const auto * result_node = toFunctionNode(actions_dag, "parseDateTime64InJodaSyntaxOrNull", {substr_node, fmt_node});
return convertNodeTypeIfNeeded(substrait_func, result_node, actions_dag);
}
else if (s_count < 3)
fmt += String(3 - s_count, 'S');
else
fmt = fmt.substr(0, fmt.size() - (s_count - 3));

const auto * fmt_node = addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeString>(), fmt);
const auto * result_node = toFunctionNode(actions_dag, "parseDateTime64InJodaSyntaxOrNull", {expr_arg, fmt_node});
return convertNodeTypeIfNeeded(substrait_func, result_node, actions_dag);
}
else
{
const auto * result_node = toFunctionNode(actions_dag, "parseDateTime64InJodaSyntaxOrNull", {expr_arg, fmt_arg});
return convertNodeTypeIfNeeded(substrait_func, result_node, actions_dag);
}
}
};
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

#include <DataTypes/DataTypeNullable.h>
#include <Parser/FunctionParser.h>

#include <Parser/scalar_function_parser/getTimestamp.h>

namespace DB
{
Expand All @@ -34,10 +34,10 @@ namespace local_engine
{

template<typename Name>
class FunctionParserUnixTimestamp : public FunctionParser
class FunctionParserUnixTimestamp : public FunctionParserGetTimestamp
{
public:
explicit FunctionParserUnixTimestamp(ParserContextPtr parser_context_) : FunctionParser(parser_context_) {}
explicit FunctionParserUnixTimestamp(ParserContextPtr parser_context_) : FunctionParserGetTimestamp(parser_context_) {}
~FunctionParserUnixTimestamp() override = default;

static constexpr auto name = Name::name;
Expand All @@ -60,13 +60,13 @@ class FunctionParserUnixTimestamp : public FunctionParser
const auto * expr_arg = parsed_args[0];
const auto * fmt_arg = parsed_args[1];
auto expr_type = removeNullable(expr_arg->result_type);
if (isString(expr_type))
return FunctionParserGetTimestamp::parse(substrait_func, actions_dag);

const DateLUTImpl * date_lut = &DateLUT::instance();
const auto * time_zone_node = addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeString>(), date_lut->getTimeZone());

const DB::ActionsDAG::Node * result_node = nullptr;
if (isString(expr_type))
result_node = toFunctionNode(actions_dag, "parseDateTime64InJodaSyntaxOrNull", {expr_arg, fmt_arg, time_zone_node});
else if (isDateOrDate32(expr_type))
if (isDateOrDate32(expr_type))
result_node = toFunctionNode(actions_dag, "sparkDateToUnixTimestamp", {expr_arg, time_zone_node});
else if (isDateTime(expr_type) || isDateTime64(expr_type))
result_node = toFunctionNode(actions_dag, "toUnixTimestamp", {expr_arg, time_zone_node});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -805,7 +805,8 @@ object GlutenConfig {
SPARK_OFFHEAP_ENABLED,
SESSION_LOCAL_TIMEZONE.key,
DECIMAL_OPERATIONS_ALLOW_PREC_LOSS.key,
SPARK_REDACTION_REGEX
SPARK_REDACTION_REGEX,
LEGACY_TIME_PARSER_POLICY.key
)
nativeConfMap.putAll(conf.filter(e => keys.contains(e._1)).asJava)

Expand Down

0 comments on commit efd2cbd

Please sign in to comment.