From cf97edb4b79b90f0f6ff9839c494bbb0e7f5934b Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Tue, 26 Dec 2023 13:40:54 -0800 Subject: [PATCH 01/12] Enable ToDate (which is basically a subset of Gpu casting string to timestamp) Signed-off-by: Navin Kumar --- integration_tests/src/main/python/data_gen.py | 4 ++-- integration_tests/src/main/python/date_time_test.py | 9 ++++----- .../src/main/python/window_function_test.py | 1 - .../src/main/scala/com/nvidia/spark/rapids/GpuCast.scala | 7 +++++++ .../main/scala/com/nvidia/spark/rapids/RapidsMeta.scala | 5 +++-- .../sql/catalyst/expressions/rapids/Timestamp.scala | 3 ++- 6 files changed, 18 insertions(+), 11 deletions(-) diff --git a/integration_tests/src/main/python/data_gen.py b/integration_tests/src/main/python/data_gen.py index dd2819a7832..a36f8b6e637 100644 --- a/integration_tests/src/main/python/data_gen.py +++ b/integration_tests/src/main/python/data_gen.py @@ -21,10 +21,10 @@ from pyspark.sql.types import * import pyspark.sql.functions as f import random -from spark_session import is_before_spark_340, with_cpu_session +from spark_session import is_before_spark_340 import sre_yield import struct -from conftest import skip_unless_precommit_tests,get_datagen_seed, is_not_utc +from conftest import get_datagen_seed, is_not_utc import time import os from functools import lru_cache diff --git a/integration_tests/src/main/python/date_time_test.py b/integration_tests/src/main/python/date_time_test.py index b7b95a89b5c..a3f3042f0aa 100644 --- a/integration_tests/src/main/python/date_time_test.py +++ b/integration_tests/src/main/python/date_time_test.py @@ -379,6 +379,7 @@ def fun(spark): assert_gpu_and_cpu_are_equal_collect(fun, conf=copy_and_update(parser_policy_dic, ansi_enabled_conf)) + @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) @tz_sensitive_test @@ -429,20 +430,18 @@ def test_string_unix_timestamp_ansi_exception(): @pytest.mark.parametrize('data_gen', [StringGen('[0-9]{4}-0[1-9]-[0-2][1-8]')], ids=idfn) @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) -@allow_non_gpu(*non_utc_allow) -def test_gettimestamp(data_gen, ansi_enabled): +def test_to_date(data_gen, ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.to_date(f.col("a"), "yyyy-MM-dd")), {'spark.sql.ansi.enabled': ansi_enabled}) @pytest.mark.parametrize('data_gen', [StringGen('0[1-9][0-9]{4}')], ids=idfn) -@allow_non_gpu(*non_utc_allow) -def test_gettimestamp_format_MMyyyy(data_gen): +def test_to_date_format_MMyyyy(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).select(f.to_date(f.col("a"), "MMyyyy"))) -def test_gettimestamp_ansi_exception(): +def test_to_date_ansi_exception(): assert_gpu_and_cpu_error( lambda spark : invalid_date_string_df(spark).select(f.to_date(f.col("a"), "yyyy-MM-dd")).collect(), error_message="Exception", diff --git a/integration_tests/src/main/python/window_function_test.py b/integration_tests/src/main/python/window_function_test.py index f9f3b063a97..68c20f79b8f 100644 --- a/integration_tests/src/main/python/window_function_test.py +++ b/integration_tests/src/main/python/window_function_test.py @@ -1682,7 +1682,6 @@ def test_window_first_last_nth_ignore_nulls(data_gen): @ignore_order(local=True) -@allow_non_gpu(*non_utc_allow) def test_to_date_with_window_functions(): """ This test ensures that date expressions 
participating alongside window aggregations diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala index 6c0bdbfe41e..9dad63232f2 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala @@ -86,6 +86,13 @@ abstract class CastExprMetaBase[INPUT <: UnaryExpression with TimeZoneAwareExpre val fromType: DataType = cast.child.dataType val toType: DataType = cast.dataType + override def isTimeZoneSupported: Boolean = { + (fromType, toType) match { + case (TimestampType, DateType) => true // this is for to_date(...) + case _ => false + } + } + override def tagExprForGpu(): Unit = { recursiveTagExprForGpuCheck() } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala index bc73338ec87..f37b59f709a 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala @@ -1126,8 +1126,9 @@ abstract class BaseExprMeta[INPUT <: Expression]( if (!isTimeZoneSupported) return checkUTCTimezone(this) // Level 3 check - if (!GpuTimeZoneDB.isSupportedTimeZone(getZoneId())) { - willNotWorkOnGpu(TimeZoneDB.timezoneNotSupportedStr(this.wrapped.getClass.toString)) + val zoneId = getZoneId() + if (!GpuTimeZoneDB.isSupportedTimeZone(zoneId)) { + willNotWorkOnGpu(TimeZoneDB.timezoneNotSupportedStr(zoneId.toString)) } } diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/catalyst/expressions/rapids/Timestamp.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/catalyst/expressions/rapids/Timestamp.scala index b441627c928..f6be1cca6bc 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/catalyst/expressions/rapids/Timestamp.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/catalyst/expressions/rapids/Timestamp.scala @@ -38,8 +38,9 @@ object TimeStamp { .withPsNote(TypeEnum.STRING, "A limited number of formats are supported"), TypeSig.STRING)), (a, conf, p, r) => new UnixTimeExprMeta[GetTimestamp](a, conf, p, r) { + override def isTimeZoneSupported = true override def convertToGpu(lhs: Expression, rhs: Expression): GpuExpression = { - GpuGetTimestamp(lhs, rhs, sparkFormat, strfFormat) + GpuGetTimestamp(lhs, rhs, sparkFormat, strfFormat, a.timeZoneId) } }) ).map(r => (r.getClassFor.asSubclass(classOf[Expression]), r)).toMap From e486d15ffef79e7138d40211a2e459875a47c388 Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Wed, 27 Dec 2023 10:42:18 -0800 Subject: [PATCH 02/12] Add tz_sensitive marker Signed-off-by: Navin Kumar --- integration_tests/src/main/python/date_time_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/integration_tests/src/main/python/date_time_test.py b/integration_tests/src/main/python/date_time_test.py index a3f3042f0aa..e93307a0605 100644 --- a/integration_tests/src/main/python/date_time_test.py +++ b/integration_tests/src/main/python/date_time_test.py @@ -430,6 +430,7 @@ def test_string_unix_timestamp_ansi_exception(): @pytest.mark.parametrize('data_gen', [StringGen('[0-9]{4}-0[1-9]-[0-2][1-8]')], ids=idfn) @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) +@tz_sensitive_test def test_to_date(data_gen, ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.to_date(f.col("a"), "yyyy-MM-dd")), @@ -437,6 +438,7 @@ def 
test_to_date(data_gen, ansi_enabled): @pytest.mark.parametrize('data_gen', [StringGen('0[1-9][0-9]{4}')], ids=idfn) +@tz_sensitive_test def test_to_date_format_MMyyyy(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).select(f.to_date(f.col("a"), "MMyyyy"))) From f834113fc27d4ba7358538beb03b13de7239c762 Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Wed, 27 Dec 2023 22:38:55 -0800 Subject: [PATCH 03/12] Fix missing imports Signed-off-by: Navin Kumar --- integration_tests/src/main/python/data_gen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integration_tests/src/main/python/data_gen.py b/integration_tests/src/main/python/data_gen.py index a36f8b6e637..383a24018af 100644 --- a/integration_tests/src/main/python/data_gen.py +++ b/integration_tests/src/main/python/data_gen.py @@ -21,10 +21,10 @@ from pyspark.sql.types import * import pyspark.sql.functions as f import random -from spark_session import is_before_spark_340 +from spark_session import is_before_spark_340, with_cpu_session import sre_yield import struct -from conftest import get_datagen_seed, is_not_utc +from conftest import skip_unless_precommit_tests, get_datagen_seed, is_not_utc import time import os from functools import lru_cache From 18cef15598c844b6a9fbd6586518a2bbe20446cd Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Thu, 28 Dec 2023 19:09:44 -0800 Subject: [PATCH 04/12] Update GpuCast to handle time zone aware casting of timestamp to date Signed-off-by: Navin Kumar --- integration_tests/src/main/python/cast_test.py | 11 +++++++++-- .../scala/com/nvidia/spark/rapids/GpuCast.scala | 17 ++++++++++++++--- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/integration_tests/src/main/python/cast_test.py b/integration_tests/src/main/python/cast_test.py index f9c54de3400..a0034460286 100644 --- a/integration_tests/src/main/python/cast_test.py +++ b/integration_tests/src/main/python/cast_test.py @@ -540,6 +540,13 @@ def test_cast_timestamp_to_string(): lambda spark: unary_op_df(spark, timestamp_gen) .selectExpr("cast(a as string)")) +def test_cast_timestamp_to_date(): + # need to start about 1 day earlier than the max to avoid overflow in cuDF + gen = TimestampGen(end=datetime(9999, 12, 30, 23, 59, 59, 999999, tzinfo=timezone.utc)) + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen) + .selectExpr("cast(a as date)")) + @pytest.mark.skipif(is_before_spark_330(), reason='DayTimeInterval is not supported before Pyspark 3.3.0') def test_cast_day_time_interval_to_string(): _assert_cast_to_string_equal(DayTimeIntervalGen(start_field='day', end_field='day', special_cases=[MIN_DAY_TIME_INTERVAL, MAX_DAY_TIME_INTERVAL, timedelta(seconds=0)]), {}) @@ -692,9 +699,9 @@ def test_cast_int_to_string_not_UTC(): lambda spark: unary_op_df(spark, int_gen, 100).selectExpr("a", "CAST(a AS STRING) as str"), {"spark.sql.session.timeZone": "+08"}) -not_utc_fallback_test_params = [(timestamp_gen, 'STRING'), (timestamp_gen, 'DATE'), +not_utc_fallback_test_params = [(timestamp_gen, 'STRING'), # python does not like year 0, and with time zones the default start date can become year 0 :( - (DateGen(start=date(1, 1, 3)), 'TIMESTAMP'), + (DateGen(start=date(1, 1, 1)), 'TIMESTAMP'), (SetValuesGen(StringType(), ['2023-03-20 10:38:50', '2023-03-20 10:39:02']), 'TIMESTAMP')] @allow_non_gpu('ProjectExec') diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala index 
9dad63232f2..826115dfa18 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala @@ -26,12 +26,13 @@ import ai.rapids.cudf.{BinaryOp, CaptureGroups, ColumnVector, ColumnView, Decima import ai.rapids.cudf import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource} import com.nvidia.spark.rapids.RapidsPluginImplicits._ -import com.nvidia.spark.rapids.jni.CastStrings +import com.nvidia.spark.rapids.jni.{CastStrings, GpuTimeZoneDB} import com.nvidia.spark.rapids.shims.{AnsiUtil, GpuCastShims, GpuIntervalUtils, GpuTypeShims, SparkShimImpl, YearParseUtil} import org.apache.commons.text.StringEscapeUtils import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.expressions.{Cast, Expression, NullIntolerant, TimeZoneAwareExpression, UnaryExpression} +import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.catalyst.util.DateTimeConstants.MICROS_PER_SECOND import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.rapids.GpuToTimestamp.replaceSpecialDates @@ -216,13 +217,16 @@ object CastOptions { * @param ansiMode Whether the cast should be ANSI compliant * @param stringToDateAnsiMode Whether to cast String to Date using ANSI compliance * @param castToJsonString Whether to use JSON format when casting to String + * @param ignoreNullFieldsInStructs Whether to omit null values when converting to JSON + * @param timeZoneId If cast is timezone aware, the timezone needed */ class CastOptions( legacyCastComplexTypesToString: Boolean, ansiMode: Boolean, stringToDateAnsiMode: Boolean, val castToJsonString: Boolean = false, - val ignoreNullFieldsInStructs: Boolean = true) extends Serializable { + val ignoreNullFieldsInStructs: Boolean = true, + val timeZoneId: Option[String] = Option.empty[String]) extends Serializable { /** * Retuns the left bracket to use when surrounding brackets when converting @@ -621,6 +625,12 @@ object GpuCast { case (_: IntegerType | ShortType | ByteType, ym: DataType) if GpuTypeShims.isSupportedYearMonthType(ym) => GpuIntervalUtils.intToYearMonthInterval(input, ym) + case (TimestampType, DateType) if options.timeZoneId.isDefined => + val zoneId = DateTimeUtils.getZoneId(options.timeZoneId.get) + withResource(GpuTimeZoneDB.fromUtcTimestampToTimestamp( input.asInstanceOf[ColumnVector], + zoneId.normalized())) { + shifted => shifted.castTo(GpuColumnVector.getNonNestedRapidsType(toDataType)) + } case _ => input.castTo(GpuColumnVector.getNonNestedRapidsType(toDataType)) } @@ -1814,7 +1824,8 @@ case class GpuCast( import GpuCast._ private val options: CastOptions = - new CastOptions(legacyCastComplexTypesToString, ansiMode, stringToDateAnsiModeEnabled) + new CastOptions(legacyCastComplexTypesToString, ansiMode, stringToDateAnsiModeEnabled, + timeZoneId = timeZoneId) // when ansi mode is enabled, some cast expressions can throw exceptions on invalid inputs override def hasSideEffects: Boolean = super.hasSideEffects || { From db5feacd4621da966f6edd7db3ea2f0bdb63bb5a Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Thu, 28 Dec 2023 23:17:21 -0800 Subject: [PATCH 05/12] Fix scalastyle Signed-off-by: Navin Kumar --- sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala index 826115dfa18..bdecf1cd3fc 100644 --- 
a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala @@ -32,8 +32,8 @@ import org.apache.commons.text.StringEscapeUtils import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.expressions.{Cast, Expression, NullIntolerant, TimeZoneAwareExpression, UnaryExpression} -import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.catalyst.util.DateTimeConstants.MICROS_PER_SECOND +import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.rapids.GpuToTimestamp.replaceSpecialDates import org.apache.spark.sql.rapids.shims.RapidsErrorUtils From 6706e2fb31714bdccec64cc431376ee9b14310de Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Thu, 28 Dec 2023 23:25:56 -0800 Subject: [PATCH 06/12] Remove unnecessary restriction on timestamp_gen for casting timestamp to date Signed-off-by: Navin Kumar --- integration_tests/src/main/python/cast_test.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/integration_tests/src/main/python/cast_test.py b/integration_tests/src/main/python/cast_test.py index a0034460286..f8ca81a91a2 100644 --- a/integration_tests/src/main/python/cast_test.py +++ b/integration_tests/src/main/python/cast_test.py @@ -541,10 +541,8 @@ def test_cast_timestamp_to_string(): .selectExpr("cast(a as string)")) def test_cast_timestamp_to_date(): - # need to start about 1 day earlier than the max to avoid overflow in cuDF - gen = TimestampGen(end=datetime(9999, 12, 30, 23, 59, 59, 999999, tzinfo=timezone.utc)) assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen) + lambda spark: unary_op_df(spark, timestamp_gen) .selectExpr("cast(a as date)")) @pytest.mark.skipif(is_before_spark_330(), reason='DayTimeInterval is not supported before Pyspark 3.3.0') def test_cast_day_time_interval_to_string(): _assert_cast_to_string_equal(DayTimeIntervalGen(start_field='day', end_field='day', special_cases=[MIN_DAY_TIME_INTERVAL, MAX_DAY_TIME_INTERVAL, timedelta(seconds=0)]), {}) From 35e66871d18b943a670d0e863ab4112ecab85695 Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Thu, 28 Dec 2023 23:33:16 -0800 Subject: [PATCH 07/12] Update test to be more thorough for to_date Signed-off-by: Navin Kumar --- integration_tests/src/main/python/date_time_test.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/integration_tests/src/main/python/date_time_test.py b/integration_tests/src/main/python/date_time_test.py index e93307a0605..b43f98e65ec 100644 --- a/integration_tests/src/main/python/date_time_test.py +++ b/integration_tests/src/main/python/date_time_test.py @@ -428,15 +428,14 @@ def test_string_unix_timestamp_ansi_exception(): error_message="Exception", conf=ansi_enabled_conf) -@pytest.mark.parametrize('data_gen', [StringGen('[0-9]{4}-0[1-9]-[0-2][1-8]')], ids=idfn) -@pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) +@pytest.mark.parametrize("ansi_enabled", [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @tz_sensitive_test -def test_to_date(data_gen, ansi_enabled): +def test_to_date(ansi_enabled): assert_gpu_and_cpu_are_equal_collect( - lambda spark : unary_op_df(spark, data_gen).select(f.to_date(f.col("a"), "yyyy-MM-dd")), + lambda spark : unary_op_df(spark, date_gen) + .select(f.to_date(f.col("a").cast('string'), "yyyy-MM-dd")), {'spark.sql.ansi.enabled': ansi_enabled}) @pytest.mark.parametrize('data_gen', [StringGen('0[1-9][0-9]{4}')], ids=idfn) @tz_sensitive_test def test_to_date_format_MMyyyy(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).select(f.to_date(f.col("a"), "MMyyyy"))) From b55a4bfc2a657b990f3b8762ed82d5de99920dbb Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Thu, 28 Dec
2023 23:34:47 -0800 Subject: [PATCH 08/12] Cleanup formatting Signed-off-by: Navin Kumar --- sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala index bdecf1cd3fc..9bf9144db0e 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala @@ -627,7 +627,7 @@ object GpuCast { GpuIntervalUtils.intToYearMonthInterval(input, ym) case (TimestampType, DateType) if options.timeZoneId.isDefined => val zoneId = DateTimeUtils.getZoneId(options.timeZoneId.get) - withResource(GpuTimeZoneDB.fromUtcTimestampToTimestamp( input.asInstanceOf[ColumnVector], + withResource(GpuTimeZoneDB.fromUtcTimestampToTimestamp(input.asInstanceOf[ColumnVector], zoneId.normalized())) { shifted => shifted.castTo(GpuColumnVector.getNonNestedRapidsType(toDataType)) } From 02eac1d4a9904b4f7d55a25dc457593862712379 Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Tue, 2 Jan 2024 23:40:29 -0800 Subject: [PATCH 09/12] GetTimestamp fix. --- .../org/apache/spark/sql/rapids/datetimeExpressions.scala | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala index 3a338c91a09..09d4a977084 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala @@ -856,7 +856,7 @@ abstract class GpuToTimestamp val tmp = lhs.dataType match { case _: StringType => // rhs is ignored we already parsed the format - if (getTimeParserPolicy == LegacyTimeParserPolicy) { + val res = if (getTimeParserPolicy == LegacyTimeParserPolicy) { parseStringAsTimestampWithLegacyParserPolicy( lhs, sparkFormat, @@ -871,6 +871,11 @@ abstract class GpuToTimestamp DType.TIMESTAMP_MICROSECONDS, failOnError) } + if (GpuOverrides.isUTCTimezone(zoneId)) { + res + } else { + GpuTimeZoneDB.fromTimestampToUtcTimestamp(res, zoneId) + } case _: DateType => timeZoneId match { case Some(_) => From 743aef2212d77ec1eed3fc2e974ddf856be82acd Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Wed, 3 Jan 2024 14:02:06 -0800 Subject: [PATCH 10/12] Add test for to_timestamp and add special cases for Asia/Shanghai transition times Signed-off-by: Navin Kumar --- .../src/main/python/date_time_test.py | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/integration_tests/src/main/python/date_time_test.py b/integration_tests/src/main/python/date_time_test.py index 5e94acc77c3..abba8b1fecd 100644 --- a/integration_tests/src/main/python/date_time_test.py +++ b/integration_tests/src/main/python/date_time_test.py @@ -14,7 +14,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect, assert_gpu_and_cpu_error -from conftest import is_utc, is_supported_time_zone +from conftest import is_utc, is_supported_time_zone, get_test_tz from data_gen import * from datetime import date, datetime, timezone from marks import ignore_order, incompat, allow_non_gpu, datagen_overrides, tz_sensitive_test @@ -394,7 +394,7 @@ def test_unix_timestamp(data_gen, ansi_enabled): (StringGen('[0-9]{4}/[01][12]/[0-2][1-8]'),'yyyy/MM/dd'), (StringGen('[01][12]/[0-2][1-8]'), 
'MM/dd'), (StringGen('[0-2][1-8]/[01][12]'), 'dd/MM'), - (ConvertGen(DateGen(nullable=False), lambda d: d.strftime('%Y/%m').zfill(7), data_type=StringType()), 'yyyy/MM')] + (ConvertGen(DateGen(nullable=False), lambda d: d.strfte('%Y/%m').zfill(7), data_type=StringType()), 'yyyy/MM')] # get invalid date string df def invalid_date_string_df(spark): @@ -428,16 +428,36 @@ def test_string_unix_timestamp_ansi_exception(): error_message="Exception", conf=ansi_enabled_conf) -@pytest.mark.parametrize("ansi_enabled", [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @tz_sensitive_test +@pytest.mark.parametrize('parser_policy', ["CORRECTED", "EXCEPTION"], ids=idfn) +def test_to_timestamp(parser_policy): + gen = StringGen("[0-9]{3}[1-9]-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]) ([0-1][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9]") + if get_test_tz() == "Asia/Shanghai": + # ensure some times around transition are tested + gen = gen.with_special_case("1991-04-14 02:00:00")\ + .with_special_case("1991-04-14 02:30:00")\ + .with_special_case("1991-04-14 03:00:00")\ + .with_special_case("1991-09-15 02:00:00")\ + .with_special_case("1991-09-15 02:30:00")\ + .with_special_case("1991-09-15 03:00:00") + assert_gpu_and_cpu_are_equal_collect( + lambda spark : unary_op_df(spark, gen) + .select(f.col("a"), f.to_timestamp(f.col("a"), "yyyy-MM-dd HH:mm:ss")), + { "spark.sql.legacy.timeParserPolicy": parser_policy}) + + +@tz_sensitive_test +@pytest.mark.skipif(not is_supported_time_zone(), reason="not all time zones are supported now, refer to https://github.com/NVIDIA/spark-rapids/issues/6839, please update after all time zones are supported") +@pytest.mark.parametrize("ansi_enabled", [True, False], ids=['ANSI_ON', 'ANSI_OFF']) def test_to_date(ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, date_gen) .select(f.to_date(f.col("a").cast('string'), "yyyy-MM-dd")), {'spark.sql.ansi.enabled': ansi_enabled}) -@pytest.mark.parametrize('data_gen', [StringGen('0[1-9][0-9]{4}')], ids=idfn) @tz_sensitive_test +@pytest.mark.skipif(not is_supported_time_zone(), reason="not all time zones are supported now, refer to https://github.com/NVIDIA/spark-rapids/issues/6839, please update after all time zones are supported") +@pytest.mark.parametrize('data_gen', [StringGen('0[1-9][0-9]{4}')], ids=idfn) def test_to_date_format_MMyyyy(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).select(f.to_date(f.col("a"), "MMyyyy"))) From dddc479e27a1f9231564ac13b25caf0d65d5491c Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Wed, 3 Jan 2024 15:16:03 -0800 Subject: [PATCH 11/12] Some typo snuck into this PR, fixing Signed-off-by: Navin Kumar --- integration_tests/src/main/python/date_time_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_tests/src/main/python/date_time_test.py b/integration_tests/src/main/python/date_time_test.py index abba8b1fecd..84be8e58647 100644 --- a/integration_tests/src/main/python/date_time_test.py +++ b/integration_tests/src/main/python/date_time_test.py @@ -394,7 +394,7 @@ def test_unix_timestamp(data_gen, ansi_enabled): (StringGen('[0-9]{4}/[01][12]/[0-2][1-8]'),'yyyy/MM/dd'), (StringGen('[01][12]/[0-2][1-8]'), 'MM/dd'), (StringGen('[0-2][1-8]/[01][12]'), 'dd/MM'), - (ConvertGen(DateGen(nullable=False), lambda d: d.strfte('%Y/%m').zfill(7), data_type=StringType()), 'yyyy/MM')] + (ConvertGen(DateGen(nullable=False), lambda d: d.strftime('%Y/%m').zfill(7), data_type=StringType()), 'yyyy/MM')] # get invalid date string df def 
invalid_date_string_df(spark): From 1579db22bbf46659461eb97dd2ece433e3b1281c Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Wed, 3 Jan 2024 21:07:19 -0800 Subject: [PATCH 12/12] Test was missing the skip for unsupported timezones Signed-off-by: Navin Kumar --- integration_tests/src/main/python/date_time_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/integration_tests/src/main/python/date_time_test.py b/integration_tests/src/main/python/date_time_test.py index 84be8e58647..8eab7f1e231 100644 --- a/integration_tests/src/main/python/date_time_test.py +++ b/integration_tests/src/main/python/date_time_test.py @@ -429,6 +429,7 @@ def test_string_unix_timestamp_ansi_exception(): conf=ansi_enabled_conf) @tz_sensitive_test +@pytest.mark.skipif(not is_supported_time_zone(), reason="not all time zones are supported now, refer to https://github.com/NVIDIA/spark-rapids/issues/6839, please update after all time zones are supported") @pytest.mark.parametrize('parser_policy', ["CORRECTED", "EXCEPTION"], ids=idfn) def test_to_timestamp(parser_policy): gen = StringGen("[0-9]{3}[1-9]-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]) ([0-1][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9]")
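Notes on the behavior this series exercises.

Patch 04 threads the session time zone through CastOptions because casting TIMESTAMP to DATE is zone-dependent in Spark: the stored UTC instant is first shifted into the session zone and only then truncated to days, which is what the added GpuTimeZoneDB.fromUtcTimestampToTimestamp call mirrors on the GPU. A minimal sketch of the CPU semantics being matched, assuming a local pyspark 3.1+ session; the epoch value 1679349600 (2023-03-20 22:00:00 UTC) is chosen purely for illustration and is not taken from the patches:

    from pyspark.sql import SparkSession
    from pyspark.sql import functions as f

    spark = SparkSession.builder.getOrCreate()
    # Pin a fixed instant so that only the session time zone changes the result.
    df = spark.range(1).select(f.timestamp_seconds(f.lit(1679349600)).alias("ts"))

    spark.conf.set("spark.sql.session.timeZone", "UTC")
    print(df.select(f.col("ts").cast("date")).head()[0])   # 2023-03-20

    spark.conf.set("spark.sql.session.timeZone", "Asia/Shanghai")
    print(df.select(f.col("ts").cast("date")).head()[0])   # 2023-03-21

The same instant lands on different calendar days, so the GPU cast cannot simply truncate the UTC microseconds when the session zone is not UTC.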
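Patch 09 fixes the opposite direction in GpuToTimestamp: a string parsed by to_timestamp is wall-clock time in the session zone, while Spark stores timestamps as UTC microseconds, so for a non-UTC zone the parsed result must be shifted local-to-UTC (the added GpuTimeZoneDB.fromTimestampToUtcTimestamp call). A sketch of the expected CPU behavior, with an illustrative sample string that is not from the patches:

    from pyspark.sql import SparkSession
    from pyspark.sql import functions as f

    spark = SparkSession.builder.getOrCreate()
    spark.conf.set("spark.sql.session.timeZone", "Asia/Shanghai")

    df = spark.createDataFrame([("2023-03-20 10:38:50",)], ["a"])
    parsed = df.select(f.to_timestamp("a", "yyyy-MM-dd HH:mm:ss").alias("ts"))

    # 10:38:50 on Shanghai clocks (UTC+8) is 02:38:50 UTC, so the stored
    # instant is 8 hours earlier than the parsed digits: epoch 1679279930.
    print(parsed.select(f.unix_timestamp("ts")).head()[0])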
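The Asia/Shanghai special cases added in patch 10 probe the 1991 daylight saving transitions, where wall-clock times can be skipped or ambiguous and the GPU must resolve them exactly as the JVM does. A sketch of the edge case, under the same illustrative session setup as above:

    from pyspark.sql import SparkSession
    from pyspark.sql import functions as f

    spark = SparkSession.builder.getOrCreate()
    spark.conf.set("spark.sql.session.timeZone", "Asia/Shanghai")

    # 1991-04-14 02:30:00 falls in the spring-forward gap and never appeared
    # on Shanghai clocks; 1991-09-15 02:30:00 sits just after the fall-back
    # change, so both are worth pinning down in tests.
    df = spark.createDataFrame(
        [("1991-04-14 02:30:00",), ("1991-09-15 02:30:00",)], ["a"])
    df.select("a", f.to_timestamp("a", "yyyy-MM-dd HH:mm:ss")).show(truncate=False)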