From a38e00aeca03d3019ec6fab7f8ac72ca6fc7f9e3 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 31 Aug 2023 10:57:07 -0600 Subject: [PATCH 1/9] add failing test Signed-off-by: Andy Grove --- integration_tests/src/main/python/csv_test.py | 30 +++++++++++++++++++ integration_tests/src/main/python/data_gen.py | 8 ++--- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/integration_tests/src/main/python/csv_test.py b/integration_tests/src/main/python/csv_test.py index 5227dd0a41c..d204b63cee9 100644 --- a/integration_tests/src/main/python/csv_test.py +++ b/integration_tests/src/main/python/csv_test.py @@ -551,6 +551,36 @@ def test_csv_read_count(spark_tmp_path): assert_gpu_and_cpu_row_counts_equal(lambda spark: spark.read.csv(data_path)) +@allow_non_gpu('FileSourceScanExec', 'CollectLimitExec', 'DeserializeToObjectExec') +@pytest.mark.skipif(is_before_spark_340(), reason='`TIMESTAMP_NTZ` is only supported in Spark 340+') +@pytest.mark.parametrize('date_format', csv_supported_date_formats) +@pytest.mark.parametrize('ts_part', csv_supported_ts_parts) +@pytest.mark.parametrize("timestamp_type", ["TIMESTAMP_LTZ", "TIMESTAMP_NTZ"]) +def test_csv_infer_schema_timestamp_ntz(spark_tmp_path, date_format, ts_part, timestamp_type): + full_format = date_format + ts_part + # specify to use no timezone rather than defaulting to UTC + data_gen = TimestampGen(tzinfo=None) + gen = StructGen([('a', data_gen)], nullable=False) + data_path = spark_tmp_path + '/CSV_DATA' + with_cpu_session( + lambda spark : gen_df(spark, gen).write + .option('timestampFormat', full_format) + .csv(data_path)) + + conf = { 'spark.sql.timestampType': timestamp_type, + 'spark.rapids.sql.explain': 'ALL'} + + def do_read(spark): + return spark.read.option("inferSchema", "true") \ + .option('timestampFormat', full_format) \ + .csv(data_path) + + assert_cpu_and_gpu_are_equal_collect_with_capture( + lambda spark: do_read(spark), + exist_classes = 'GpuFileSourceScanExec', + non_exist_classes = 'FileSourceScanExec', + conf = conf) + @allow_non_gpu('FileSourceScanExec', 'CollectLimitExec', 'DeserializeToObjectExec') @pytest.mark.skipif(is_before_spark_340(), reason='`preferDate` is only supported in Spark 340+') def test_csv_prefer_date_with_infer_schema(spark_tmp_path): diff --git a/integration_tests/src/main/python/data_gen.py b/integration_tests/src/main/python/data_gen.py index ab9890b0712..6664c2a9d2f 100644 --- a/integration_tests/src/main/python/data_gen.py +++ b/integration_tests/src/main/python/data_gen.py @@ -572,7 +572,7 @@ def start(self, rand): class TimestampGen(DataGen): """Generate Timestamps in a given range. 
All timezones are UTC by default.""" - def __init__(self, start=None, end=None, nullable=True): + def __init__(self, start=None, end=None, nullable=True, tzinfo=timezone.utc): super().__init__(TimestampType(), nullable=nullable) if start is None: # Spark supports times starting at @@ -580,19 +580,20 @@ def __init__(self, start=None, end=None, nullable=True): # but it has issues if you get really close to that because it tries to do things # in a different format which causes roundoff, so we have to add a few days, # just to be sure - start = datetime(1, 1, 3, tzinfo=timezone.utc) + start = datetime(1, 1, 3, tzinfo=tzinfo) elif not isinstance(start, datetime): raise RuntimeError('Unsupported type passed in for start {}'.format(start)) if end is None: # Spark supports time through # "9999-12-31 23:59:59.999999" - end = datetime(9999, 12, 31, 23, 59, 59, 999999, tzinfo=timezone.utc) + end = datetime(9999, 12, 31, 23, 59, 59, 999999, tzinfo=tzinfo) elif isinstance(end, timedelta): end = start + end elif not isinstance(start, date): raise RuntimeError('Unsupported type passed in for end {}'.format(end)) + self._epoch = datetime(1970, 1, 1, tzinfo=tzinfo) self._start_time = self._to_us_since_epoch(start) self._end_time = self._to_us_since_epoch(end) if (self._epoch >= start and self._epoch <= end): @@ -601,7 +602,6 @@ def __init__(self, start=None, end=None, nullable=True): def _cache_repr(self): return super()._cache_repr() + '(' + str(self._start_time) + ',' + str(self._end_time) + ')' - _epoch = datetime(1970, 1, 1, tzinfo=timezone.utc) _us = timedelta(microseconds=1) def _to_us_since_epoch(self, val): From 9b461d1fc19ad85a2dec8394ebf147f328b9993a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 31 Aug 2023 11:34:30 -0600 Subject: [PATCH 2/9] test now reflects current behavior of falling back to CPU for timestamp_ntz --- integration_tests/src/main/python/csv_test.py | 27 +++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/integration_tests/src/main/python/csv_test.py b/integration_tests/src/main/python/csv_test.py index d204b63cee9..f30f53fea19 100644 --- a/integration_tests/src/main/python/csv_test.py +++ b/integration_tests/src/main/python/csv_test.py @@ -567,19 +567,30 @@ def test_csv_infer_schema_timestamp_ntz(spark_tmp_path, date_format, ts_part, ti .option('timestampFormat', full_format) .csv(data_path)) - conf = { 'spark.sql.timestampType': timestamp_type, - 'spark.rapids.sql.explain': 'ALL'} - def do_read(spark): return spark.read.option("inferSchema", "true") \ .option('timestampFormat', full_format) \ .csv(data_path) - assert_cpu_and_gpu_are_equal_collect_with_capture( - lambda spark: do_read(spark), - exist_classes = 'GpuFileSourceScanExec', - non_exist_classes = 'FileSourceScanExec', - conf = conf) + conf = { 'spark.sql.timestampType': timestamp_type, + 'spark.rapids.sql.explain': 'ALL'} + + # determine whether Spark CPU infers TimestampType or TimestampNtzType + inferred_type = with_cpu_session( + lambda spark : do_read(spark).schema["_c0"].dataType.typeName(), conf=conf) + + if inferred_type == "timestamp_ntz": + # we fall back to CPU due to "unsupported data types in output: TimestampNTZType" + assert_gpu_fallback_collect( + lambda spark: do_read(spark), + cpu_fallback_class_name = 'FileSourceScanExec', + conf = conf) + else: + assert_cpu_and_gpu_are_equal_collect_with_capture( + lambda spark: do_read(spark), + exist_classes = 'GpuFileSourceScanExec', + non_exist_classes = 'FileSourceScanExec', + conf = conf) 
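
# --- A minimal illustrative sketch (not part of the patch series) of why PATCH 1/9
# moves the epoch reference from a class attribute into __init__: Python cannot
# subtract an offset-aware datetime from a naive one, so a shared UTC-aware epoch
# would break TimestampGen(tzinfo=None). The helper name us_since_epoch below is
# hypothetical; only the datetime arithmetic is asserted here.
from datetime import datetime, timedelta, timezone

def us_since_epoch(val, tzinfo):
    # the epoch must carry the same tzinfo (or lack of one) as the generated values
    epoch = datetime(1970, 1, 1, tzinfo=tzinfo)
    return (val - epoch) // timedelta(microseconds=1)

us_since_epoch(datetime(2023, 8, 31, tzinfo=timezone.utc), timezone.utc)  # aware - aware: fine
us_since_epoch(datetime(2023, 8, 31), None)                               # naive - naive: fine
# Mixing the two kinds, e.g. datetime(2023, 8, 31) - datetime(1970, 1, 1, tzinfo=timezone.utc),
# raises "TypeError: can't subtract offset-naive and offset-aware datetimes",
# which is why the instance-level self._epoch must be built from the same tzinfo.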
@allow_non_gpu('FileSourceScanExec', 'CollectLimitExec', 'DeserializeToObjectExec') @pytest.mark.skipif(is_before_spark_340(), reason='`preferDate` is only supported in Spark 340+') From 27bc2d254bc92e098859dd6ae4f27320122b296e Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 31 Aug 2023 11:51:02 -0600 Subject: [PATCH 3/9] update docs --- docs/compatibility.md | 6 +++--- integration_tests/src/main/python/data_gen.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/compatibility.md b/docs/compatibility.md index b5cb01757dd..5f01dee5a5e 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -123,9 +123,9 @@ Only a limited set of formats are supported when parsing dates. ### CSV Timestamps The CSV parser does not support time zones. It will ignore any trailing time zone information, -despite the format asking for a `XXX` or `[XXX]`. As such it is off by default and you can enable it -by setting [`spark.rapids.sql.csvTimestamps.enabled`](configs.md#sql.csvTimestamps.enabled) to -`true`. +despite the format asking for a `XXX` or `[XXX]`. The CSV parser does not support the `TimestampNTZ` +type and wil fall back to CPU if `spark.sql.timeStamp` is set to `TIMESTAMP_NTZ` or if an +explicit schema is provided that contains the `TimestampNTZ` type. The formats supported for timestamps are limited similar to dates. The first part of the format must be a supported date format. The second part must start with a `'T'` to separate the time diff --git a/integration_tests/src/main/python/data_gen.py b/integration_tests/src/main/python/data_gen.py index 6664c2a9d2f..9f549adfa46 100644 --- a/integration_tests/src/main/python/data_gen.py +++ b/integration_tests/src/main/python/data_gen.py @@ -573,7 +573,7 @@ def start(self, rand): class TimestampGen(DataGen): """Generate Timestamps in a given range. All timezones are UTC by default.""" def __init__(self, start=None, end=None, nullable=True, tzinfo=timezone.utc): - super().__init__(TimestampType(), nullable=nullable) + super().__init__(TimestampNTZType() if tzinfo==None else TimestampType(), nullable=nullable) if start is None: # Spark supports times starting at # "0001-01-01 00:00:00.000000" From 7415698d4b3c2bc59e8378268f795665090cd5f4 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 31 Aug 2023 11:55:27 -0600 Subject: [PATCH 4/9] fix typos Signed-off-by: Andy Grove --- docs/compatibility.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/compatibility.md b/docs/compatibility.md index 5f01dee5a5e..9600ced062e 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -124,7 +124,7 @@ Only a limited set of formats are supported when parsing dates. ### CSV Timestamps The CSV parser does not support time zones. It will ignore any trailing time zone information, despite the format asking for a `XXX` or `[XXX]`. The CSV parser does not support the `TimestampNTZ` -type and wil fall back to CPU if `spark.sql.timeStamp` is set to `TIMESTAMP_NTZ` or if an +type and will fall back to CPU if `spark.sql.timestampType` is set to `TIMESTAMP_NTZ` or if an explicit schema is provided that contains the `TimestampNTZ` type. The formats supported for timestamps are limited similar to dates. 
The first part of the format From 06eb8d7a34f4ea61bc429739204a18e1f1ecbe62 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 31 Aug 2023 13:11:00 -0600 Subject: [PATCH 5/9] add json test --- .../src/main/python/json_test.py | 47 ++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/integration_tests/src/main/python/json_test.py b/integration_tests/src/main/python/json_test.py index 7c8b2499130..68d5c2d02cb 100644 --- a/integration_tests/src/main/python/json_test.py +++ b/integration_tests/src/main/python/json_test.py @@ -15,8 +15,10 @@ import pyspark.sql.functions as f import pytest -from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error, assert_gpu_and_cpu_row_counts_equal, assert_gpu_fallback_collect +from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error, assert_gpu_and_cpu_row_counts_equal, \ + assert_gpu_fallback_collect, assert_cpu_and_gpu_are_equal_collect_with_capture from data_gen import * +from datetime import timezone from conftest import is_databricks_runtime from marks import approximate_float, allow_non_gpu, ignore_order from spark_session import with_cpu_session, with_gpu_session, is_before_spark_330 @@ -200,6 +202,49 @@ def test_json_ts_formats_round_trip(spark_tmp_path, date_format, ts_part, v1_ena .json(data_path), conf=updated_conf) +@allow_non_gpu('BatchScanExec', 'FileSourceScanExec', 'ProjectExec') +@pytest.mark.skipif(is_before_spark_340(), reason='`TIMESTAMP_NTZ` is only supported in Spark 340+') +@pytest.mark.parametrize('ts_part', json_supported_ts_parts) +@pytest.mark.parametrize('date_format', json_supported_date_formats) +@pytest.mark.parametrize("timestamp_type", ["TIMESTAMP_LTZ", "TIMESTAMP_NTZ"]) +@pytest.mark.parametrize('v1_enabled_list', ["", "json"]) +def test_json_ts_formats_round_trip_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, v1_enabled_list): + full_format = date_format + ts_part + data_gen = TimestampGen(tzinfo=None if timestamp_type == "TIMESTAMP_NTZ" else timezone.utc) + gen = StructGen([('a', data_gen)], nullable=False) + data_path = spark_tmp_path + '/JSON_DATA' + schema = gen.data_type + with_cpu_session( + lambda spark : gen_df(spark, gen).write \ + .option('timestampFormat', full_format) \ + .json(data_path)) + updated_conf = copy_and_update(_enable_all_types_conf, + { + 'spark.sql.sources.useV1SourceList': v1_enabled_list, + 'spark.sql.timestampType': timestamp_type + }) + + def do_read(spark): + return spark.read \ + .schema(schema) \ + .option('timestampFormat', full_format) \ + .json(data_path) + + cpu_scan_class = 'BatchScanExec' if v1_enabled_list == '' else 'FileSourceScanExec' + + if timestamp_type == "TIMESTAMP_LTZ": + assert_cpu_and_gpu_are_equal_collect_with_capture( + lambda spark : do_read(spark), + exist_classes = 'Gpu' + cpu_scan_class, + non_exist_classes = cpu_scan_class, + conf=updated_conf) + else: + # we fall back to CPU due to "unsupported data types in output: TimestampNTZType" + assert_gpu_fallback_collect( + lambda spark : do_read(spark), + cpu_fallback_class_name = cpu_scan_class, + conf=updated_conf) + @approximate_float @pytest.mark.parametrize('filename', [ 'boolean.json', From 8e25b6140ad6fb44f2e8a12b841d8d3c10c5b180 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 31 Aug 2023 13:11:53 -0600 Subject: [PATCH 6/9] update docs --- docs/compatibility.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/compatibility.md b/docs/compatibility.md index 9600ced062e..01f9707e17a 100644 --- a/docs/compatibility.md 
+++ b/docs/compatibility.md @@ -350,6 +350,9 @@ Spark will treat them as invalid inputs and will just return `null`. ### JSON Timestamps +The JSON parser does not support the `TimestampNTZ` type and will fall back to CPU if `spark.sql.timestampType` is +set to `TIMESTAMP_NTZ` or if an explicit schema is provided that contains the `TimestampNTZ` type. + There is currently no support for reading numeric values as timestamps and null values are returned instead ([#4940](https://github.com/NVIDIA/spark-rapids/issues/4940)). A workaround would be to read as longs and then cast to timestamp. From 3611bda242ef5e7d1d3a88e7be40df50c4448da9 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 12 Sep 2023 16:50:53 -0600 Subject: [PATCH 7/9] refactor json test --- integration_tests/src/main/python/json_test.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/integration_tests/src/main/python/json_test.py b/integration_tests/src/main/python/json_test.py index 68d5c2d02cb..b8200e4366d 100644 --- a/integration_tests/src/main/python/json_test.py +++ b/integration_tests/src/main/python/json_test.py @@ -202,13 +202,23 @@ def test_json_ts_formats_round_trip(spark_tmp_path, date_format, ts_part, v1_ena .json(data_path), conf=updated_conf) -@allow_non_gpu('BatchScanExec', 'FileSourceScanExec', 'ProjectExec') +@allow_non_gpu('BatchScanExec', 'ProjectExec') @pytest.mark.skipif(is_before_spark_340(), reason='`TIMESTAMP_NTZ` is only supported in Spark 340+') @pytest.mark.parametrize('ts_part', json_supported_ts_parts) @pytest.mark.parametrize('date_format', json_supported_date_formats) @pytest.mark.parametrize("timestamp_type", ["TIMESTAMP_LTZ", "TIMESTAMP_NTZ"]) -@pytest.mark.parametrize('v1_enabled_list', ["", "json"]) -def test_json_ts_formats_round_trip_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, v1_enabled_list): +def test_json_ts_formats_round_trip_ntz_v1(spark_tmp_path, date_format, ts_part, timestamp_type): + json_ts_formats_round_trip_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, '', 'BatchScanExec') + +@allow_non_gpu('FileSourceScanExec', 'ProjectExec') +@pytest.mark.skipif(is_before_spark_340(), reason='`TIMESTAMP_NTZ` is only supported in Spark 340+') +@pytest.mark.parametrize('ts_part', json_supported_ts_parts) +@pytest.mark.parametrize('date_format', json_supported_date_formats) +@pytest.mark.parametrize("timestamp_type", ["TIMESTAMP_LTZ", "TIMESTAMP_NTZ"]) +def test_json_ts_formats_round_trip_ntz_v2(spark_tmp_path, date_format, ts_part, timestamp_type): + json_ts_formats_round_trip_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, 'json', 'FileSourceScanExec') + +def json_ts_formats_round_trip_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, v1_enabled_list, cpu_scan_class): full_format = date_format + ts_part data_gen = TimestampGen(tzinfo=None if timestamp_type == "TIMESTAMP_NTZ" else timezone.utc) gen = StructGen([('a', data_gen)], nullable=False) @@ -230,7 +240,6 @@ def do_read(spark): .option('timestampFormat', full_format) \ .json(data_path) - cpu_scan_class = 'BatchScanExec' if v1_enabled_list == '' else 'FileSourceScanExec' if timestamp_type == "TIMESTAMP_LTZ": assert_cpu_and_gpu_are_equal_collect_with_capture( From 0610b958bfbc377fab4ece1dfe81f1393c335f2e Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 18 Sep 2023 10:29:23 -0600 Subject: [PATCH 8/9] implement v1/v2 csv tests --- integration_tests/src/main/python/csv_test.py | 21 ++++++++++++++----- .../src/main/python/json_test.py | 8 +++---- 2 files 
changed, 20 insertions(+), 9 deletions(-) diff --git a/integration_tests/src/main/python/csv_test.py b/integration_tests/src/main/python/csv_test.py index f30f53fea19..12b4accc7f7 100644 --- a/integration_tests/src/main/python/csv_test.py +++ b/integration_tests/src/main/python/csv_test.py @@ -551,12 +551,23 @@ def test_csv_read_count(spark_tmp_path): assert_gpu_and_cpu_row_counts_equal(lambda spark: spark.read.csv(data_path)) +@allow_non_gpu('FileSourceScanExec', 'ProjectExec', 'CollectLimitExec', 'DeserializeToObjectExec') +@pytest.mark.skipif(is_before_spark_340(), reason='`TIMESTAMP_NTZ` is only supported in Spark 340+') +@pytest.mark.parametrize('date_format', csv_supported_date_formats) +@pytest.mark.parametrize('ts_part', csv_supported_ts_parts) +@pytest.mark.parametrize("timestamp_type", ["TIMESTAMP_LTZ", "TIMESTAMP_NTZ"]) +def test_csv_infer_schema_timestamp_ntz_v1(spark_tmp_path, date_format, ts_part, timestamp_type): + csv_infer_schema_timestamp_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, 'csv', 'FileSourceScanExec') + @allow_non_gpu('FileSourceScanExec', 'CollectLimitExec', 'DeserializeToObjectExec') @pytest.mark.skipif(is_before_spark_340(), reason='`TIMESTAMP_NTZ` is only supported in Spark 340+') @pytest.mark.parametrize('date_format', csv_supported_date_formats) @pytest.mark.parametrize('ts_part', csv_supported_ts_parts) @pytest.mark.parametrize("timestamp_type", ["TIMESTAMP_LTZ", "TIMESTAMP_NTZ"]) -def test_csv_infer_schema_timestamp_ntz(spark_tmp_path, date_format, ts_part, timestamp_type): +def test_csv_infer_schema_timestamp_ntz_v2(spark_tmp_path, date_format, ts_part, timestamp_type): + csv_infer_schema_timestamp_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, '', 'FileSourceScanExec') + +def csv_infer_schema_timestamp_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, v1_enabled_list, cpu_scan_class): full_format = date_format + ts_part # specify to use no timezone rather than defaulting to UTC data_gen = TimestampGen(tzinfo=None) @@ -573,7 +584,7 @@ def do_read(spark): .csv(data_path) conf = { 'spark.sql.timestampType': timestamp_type, - 'spark.rapids.sql.explain': 'ALL'} + 'spark.sql.sources.useV1SourceList': v1_enabled_list } # determine whether Spark CPU infers TimestampType or TimestampNtzType inferred_type = with_cpu_session( @@ -583,13 +594,13 @@ def do_read(spark): # we fall back to CPU due to "unsupported data types in output: TimestampNTZType" assert_gpu_fallback_collect( lambda spark: do_read(spark), - cpu_fallback_class_name = 'FileSourceScanExec', + cpu_fallback_class_name = cpu_scan_class, conf = conf) else: assert_cpu_and_gpu_are_equal_collect_with_capture( lambda spark: do_read(spark), - exist_classes = 'GpuFileSourceScanExec', - non_exist_classes = 'FileSourceScanExec', + exist_classes = 'Gpu' + cpu_scan_class, + non_exist_classes = cpu_scan_class, conf = conf) @allow_non_gpu('FileSourceScanExec', 'CollectLimitExec', 'DeserializeToObjectExec') diff --git a/integration_tests/src/main/python/json_test.py b/integration_tests/src/main/python/json_test.py index b8200e4366d..181e8f11bd2 100644 --- a/integration_tests/src/main/python/json_test.py +++ b/integration_tests/src/main/python/json_test.py @@ -202,21 +202,21 @@ def test_json_ts_formats_round_trip(spark_tmp_path, date_format, ts_part, v1_ena .json(data_path), conf=updated_conf) -@allow_non_gpu('BatchScanExec', 'ProjectExec') +@allow_non_gpu('FileSourceScanExec', 'ProjectExec') @pytest.mark.skipif(is_before_spark_340(), reason='`TIMESTAMP_NTZ` is only supported in Spark 
340+') @pytest.mark.parametrize('ts_part', json_supported_ts_parts) @pytest.mark.parametrize('date_format', json_supported_date_formats) @pytest.mark.parametrize("timestamp_type", ["TIMESTAMP_LTZ", "TIMESTAMP_NTZ"]) def test_json_ts_formats_round_trip_ntz_v1(spark_tmp_path, date_format, ts_part, timestamp_type): - json_ts_formats_round_trip_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, '', 'BatchScanExec') + json_ts_formats_round_trip_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, 'json', 'FileSourceScanExec') -@allow_non_gpu('FileSourceScanExec', 'ProjectExec') +@allow_non_gpu('BatchScanExec', 'ProjectExec') @pytest.mark.skipif(is_before_spark_340(), reason='`TIMESTAMP_NTZ` is only supported in Spark 340+') @pytest.mark.parametrize('ts_part', json_supported_ts_parts) @pytest.mark.parametrize('date_format', json_supported_date_formats) @pytest.mark.parametrize("timestamp_type", ["TIMESTAMP_LTZ", "TIMESTAMP_NTZ"]) def test_json_ts_formats_round_trip_ntz_v2(spark_tmp_path, date_format, ts_part, timestamp_type): - json_ts_formats_round_trip_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, 'json', 'FileSourceScanExec') + json_ts_formats_round_trip_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, '', 'BatchScanExec') def json_ts_formats_round_trip_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, v1_enabled_list, cpu_scan_class): full_format = date_format + ts_part From 9bc7f94cab8cac3890f9f7b764ca631e40dacff6 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 18 Sep 2023 10:57:08 -0600 Subject: [PATCH 9/9] csv tests pass --- integration_tests/src/main/python/csv_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integration_tests/src/main/python/csv_test.py b/integration_tests/src/main/python/csv_test.py index 12b4accc7f7..bae076f5a4d 100644 --- a/integration_tests/src/main/python/csv_test.py +++ b/integration_tests/src/main/python/csv_test.py @@ -559,13 +559,13 @@ def test_csv_read_count(spark_tmp_path): def test_csv_infer_schema_timestamp_ntz_v1(spark_tmp_path, date_format, ts_part, timestamp_type): csv_infer_schema_timestamp_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, 'csv', 'FileSourceScanExec') -@allow_non_gpu('FileSourceScanExec', 'CollectLimitExec', 'DeserializeToObjectExec') +@allow_non_gpu('BatchScanExec', 'FileSourceScanExec', 'ProjectExec', 'CollectLimitExec', 'DeserializeToObjectExec') @pytest.mark.skipif(is_before_spark_340(), reason='`TIMESTAMP_NTZ` is only supported in Spark 340+') @pytest.mark.parametrize('date_format', csv_supported_date_formats) @pytest.mark.parametrize('ts_part', csv_supported_ts_parts) @pytest.mark.parametrize("timestamp_type", ["TIMESTAMP_LTZ", "TIMESTAMP_NTZ"]) def test_csv_infer_schema_timestamp_ntz_v2(spark_tmp_path, date_format, ts_part, timestamp_type): - csv_infer_schema_timestamp_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, '', 'FileSourceScanExec') + csv_infer_schema_timestamp_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, '', 'BatchScanExec') def csv_infer_schema_timestamp_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, v1_enabled_list, cpu_scan_class): full_format = date_format + ts_part
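
# --- A minimal sketch (outside the patch series) of the v1/v2 distinction the
# _v1/_v2 tests above parameterize: when a format appears in
# spark.sql.sources.useV1SourceList, Spark plans a FileSourceScanExec; when the
# list is empty, it plans a DataSource v2 BatchScanExec. The plugin replaces
# these with GpuFileSourceScanExec / GpuBatchScanExec unless it falls back to
# CPU, which is what the assertions above capture. The path and session setup
# below are illustrative assumptions.
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .config('spark.sql.sources.useV1SourceList', 'csv')  # set to '' to select v2
         .getOrCreate())
df = spark.read.option('inferSchema', 'true').csv('/tmp/CSV_DATA')
df.explain()  # v1: FileSourceScanExec appears in the plan; with '' it is BatchScanExec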