From 57fbd5ec20cb5104b6df1f2bbcc2b9b581313f88 Mon Sep 17 00:00:00 2001 From: Eduard Karacharov Date: Thu, 15 Aug 2024 19:35:47 +0300 Subject: [PATCH] catalog.has_header true be default --- datafusion/common/src/config.rs | 2 +- .../common/src/file_options/csv_writer.rs | 2 +- .../core/src/datasource/file_format/csv.rs | 23 +++++++++++++++++-- .../tests/user_defined/user_defined_plan.rs | 13 ++++++++--- datafusion/sqllogictest/test_files/copy.slt | 6 ++--- .../sqllogictest/test_files/csv_files.slt | 11 +++++---- datafusion/sqllogictest/test_files/ddl.slt | 7 ++++-- .../sqllogictest/test_files/group_by.slt | 3 ++- .../test_files/information_schema.slt | 4 ++-- datafusion/sqllogictest/test_files/limit.slt | 2 +- datafusion/sqllogictest/test_files/order.slt | 6 +++-- .../sqllogictest/test_files/projection.slt | 6 +++-- datafusion/sqllogictest/test_files/window.slt | 3 ++- 13 files changed, 62 insertions(+), 26 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index c48845c061e7..37d26c6f00c4 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -183,7 +183,7 @@ config_namespace! { /// Default value for `format.has_header` for `CREATE EXTERNAL TABLE` /// if not specified explicitly in the statement. - pub has_header: bool, default = false + pub has_header: bool, default = true /// Specifies whether newlines in (quoted) CSV values are supported. /// diff --git a/datafusion/common/src/file_options/csv_writer.rs b/datafusion/common/src/file_options/csv_writer.rs index ae069079a68f..943288af9164 100644 --- a/datafusion/common/src/file_options/csv_writer.rs +++ b/datafusion/common/src/file_options/csv_writer.rs @@ -50,7 +50,7 @@ impl TryFrom<&CsvOptions> for CsvWriterOptions { fn try_from(value: &CsvOptions) -> Result { let mut builder = WriterBuilder::default() - .with_header(value.has_header.unwrap_or(false)) + .with_header(value.has_header.unwrap_or(true)) .with_quote(value.quote) .with_delimiter(value.delimiter); diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs index c55f678aef0f..24d55ea54068 100644 --- a/datafusion/core/src/datasource/file_format/csv.rs +++ b/datafusion/core/src/datasource/file_format/csv.rs @@ -369,7 +369,7 @@ impl FileFormat for CsvFormat { async fn create_writer_physical_plan( &self, input: Arc, - _state: &SessionState, + state: &SessionState, conf: FileSinkConfig, order_requirements: Option>, ) -> Result> { @@ -377,7 +377,26 @@ impl FileFormat for CsvFormat { return not_impl_err!("Overwrites are not implemented yet for CSV"); } - let writer_options = CsvWriterOptions::try_from(&self.options)?; + // `has_header` and `newlines_in_values` fields of CsvOptions may inherit + // their values from session from configuration settings. To support + // this logic, writer options are built from the copy of `self.options` + // with updated values of these special fields. + let has_header = self + .options() + .has_header + .unwrap_or(state.config_options().catalog.has_header); + let newlines_in_values = self + .options() + .newlines_in_values + .unwrap_or(state.config_options().catalog.newlines_in_values); + + let options = self + .options() + .clone() + .with_has_header(has_header) + .with_newlines_in_values(newlines_in_values); + + let writer_options = CsvWriterOptions::try_from(&options)?; let sink_schema = conf.output_schema().clone(); let sink = Arc::new(CsvSink::new(conf, writer_options)); diff --git a/datafusion/core/tests/user_defined/user_defined_plan.rs b/datafusion/core/tests/user_defined/user_defined_plan.rs index 1aa33fc75e5d..62ba113da0d3 100644 --- a/datafusion/core/tests/user_defined/user_defined_plan.rs +++ b/datafusion/core/tests/user_defined/user_defined_plan.rs @@ -113,7 +113,11 @@ async fn exec_sql(ctx: &SessionContext, sql: &str) -> Result { /// Create a test table. async fn setup_table(ctx: SessionContext) -> Result { - let sql = "CREATE EXTERNAL TABLE sales(customer_id VARCHAR, revenue BIGINT) STORED AS CSV location 'tests/data/customer.csv'"; + let sql = " + CREATE EXTERNAL TABLE sales(customer_id VARCHAR, revenue BIGINT) + STORED AS CSV location 'tests/data/customer.csv' + OPTIONS('format.has_header' 'false') + "; let expected = vec!["++", "++"]; @@ -125,8 +129,11 @@ async fn setup_table(ctx: SessionContext) -> Result { } async fn setup_table_without_schemas(ctx: SessionContext) -> Result { - let sql = - "CREATE EXTERNAL TABLE sales STORED AS CSV location 'tests/data/customer.csv'"; + let sql = " + CREATE EXTERNAL TABLE sales + STORED AS CSV location 'tests/data/customer.csv' + OPTIONS('format.has_header' 'false') + "; let expected = vec!["++", "++"]; diff --git a/datafusion/sqllogictest/test_files/copy.slt b/datafusion/sqllogictest/test_files/copy.slt index ebb3ca2173b8..d2a3a214d71e 100644 --- a/datafusion/sqllogictest/test_files/copy.slt +++ b/datafusion/sqllogictest/test_files/copy.slt @@ -417,7 +417,7 @@ COPY source_table to 'test_files/scratch/copy/table_csv' STORED AS CSV OPTIONS # validate folder of csv files statement ok -CREATE EXTERNAL TABLE validate_csv STORED AS csv LOCATION 'test_files/scratch/copy/table_csv' OPTIONS ('format.compression' 'gzip'); +CREATE EXTERNAL TABLE validate_csv STORED AS csv LOCATION 'test_files/scratch/copy/table_csv' OPTIONS ('format.has_header' false, 'format.compression' gzip); query IT select * from validate_csv; @@ -427,7 +427,7 @@ select * from validate_csv; # Copy from table to single csv query I -COPY source_table to 'test_files/scratch/copy/table.csv'; +COPY source_table to 'test_files/scratch/copy/table.csv' OPTIONS ('format.has_header' false); ---- 2 @@ -478,7 +478,7 @@ query I COPY source_table to 'test_files/scratch/copy/table_csv_with_options' STORED AS CSV OPTIONS ( -'format.has_header' false, +'format.has_header' true, 'format.compression' uncompressed, 'format.datetime_format' '%FT%H:%M:%S.%9f', 'format.delimiter' ';', diff --git a/datafusion/sqllogictest/test_files/csv_files.slt b/datafusion/sqllogictest/test_files/csv_files.slt index 3fb9a6f20c24..7cb21abdba10 100644 --- a/datafusion/sqllogictest/test_files/csv_files.slt +++ b/datafusion/sqllogictest/test_files/csv_files.slt @@ -117,14 +117,14 @@ CREATE TABLE src_table_2 ( query I COPY src_table_1 TO 'test_files/scratch/csv_files/csv_partitions/1.csv' -STORED AS CSV; +STORED AS CSV OPTIONS ('format.has_header' 'false'); ---- 4 query I COPY src_table_2 TO 'test_files/scratch/csv_files/csv_partitions/2.csv' -STORED AS CSV; +STORED AS CSV OPTIONS ('format.has_header' 'false'); ---- 4 @@ -210,7 +210,7 @@ COPY (VALUES ('#second line is a comment'), ('2,3')) TO 'test_files/scratch/csv_files/file_with_comments.csv' -OPTIONS ('format.delimiter' '|'); +OPTIONS ('format.delimiter' '|', 'format.has_header' 'false'); statement ok CREATE EXTERNAL TABLE stored_table_with_comments ( @@ -219,7 +219,8 @@ CREATE EXTERNAL TABLE stored_table_with_comments ( ) STORED AS CSV LOCATION 'test_files/scratch/csv_files/file_with_comments.csv' OPTIONS ('format.comment' '#', - 'format.delimiter' ','); + 'format.delimiter' ',', + 'format.has_header' 'false'); query TT SELECT * from stored_table_with_comments; @@ -315,7 +316,7 @@ col1 TEXT, col2 TEXT ) STORED AS CSV LOCATION '../core/tests/data/newlines_in_values.csv' -OPTIONS ('format.newlines_in_values' 'true'); +OPTIONS ('format.newlines_in_values' 'true', 'format.has_header' 'false'); query TT select * from stored_table_with_newlines_in_values_safe; diff --git a/datafusion/sqllogictest/test_files/ddl.slt b/datafusion/sqllogictest/test_files/ddl.slt index a35e688479e7..7164425fc0f5 100644 --- a/datafusion/sqllogictest/test_files/ddl.slt +++ b/datafusion/sqllogictest/test_files/ddl.slt @@ -470,7 +470,9 @@ statement ok CREATE EXTERNAL TABLE csv_with_timestamps ( name VARCHAR, ts TIMESTAMP -) STORED AS CSV LOCATION '../core/tests/data/timestamps.csv'; +) STORED AS CSV +LOCATION '../core/tests/data/timestamps.csv' +OPTIONS('format.has_header' 'false'); query TP SELECT * from csv_with_timestamps @@ -496,7 +498,8 @@ CREATE EXTERNAL TABLE csv_with_timestamps ( ) STORED AS CSV PARTITIONED BY (c_date) -LOCATION '../core/tests/data/partitioned_table'; +LOCATION '../core/tests/data/partitioned_table' +OPTIONS('format.has_header' 'false'); query TPD SELECT * from csv_with_timestamps where c_date='2018-11-13' diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index 5571315e2acc..3d78bd06c30b 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -4264,7 +4264,8 @@ CREATE EXTERNAL TABLE csv_with_timestamps ( ) STORED AS CSV WITH ORDER (ts DESC) -LOCATION '../core/tests/data/timestamps.csv'; +LOCATION '../core/tests/data/timestamps.csv' +OPTIONS('format.has_header' 'false'); # below query should run since it operates on a bounded source and have a sort # at the top of its plan. diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index ff793a72fd8a..efd017a90bc4 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -165,7 +165,7 @@ datafusion.catalog.create_default_catalog_and_schema true datafusion.catalog.default_catalog datafusion datafusion.catalog.default_schema public datafusion.catalog.format NULL -datafusion.catalog.has_header false +datafusion.catalog.has_header true datafusion.catalog.information_schema true datafusion.catalog.location NULL datafusion.catalog.newlines_in_values false @@ -255,7 +255,7 @@ datafusion.catalog.create_default_catalog_and_schema true Whether the default ca datafusion.catalog.default_catalog datafusion The default catalog name - this impacts what SQL queries use if not specified datafusion.catalog.default_schema public The default schema name - this impacts what SQL queries use if not specified datafusion.catalog.format NULL Type of `TableProvider` to use when loading `default` schema -datafusion.catalog.has_header false Default value for `format.has_header` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. +datafusion.catalog.has_header true Default value for `format.has_header` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. datafusion.catalog.information_schema true Should DataFusion provide access to `information_schema` virtual tables for displaying schema information datafusion.catalog.location NULL Location scanned to load tables for `default` schema datafusion.catalog.newlines_in_values false Specifies whether newlines in (quoted) CSV values are supported. This is the default value for `format.newlines_in_values` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. Parsing newlines in quoted values may be affected by execution behaviour such as parallel file scanning. Setting this to `true` ensures that newlines in values are parsed successfully, which may reduce performance. diff --git a/datafusion/sqllogictest/test_files/limit.slt b/datafusion/sqllogictest/test_files/limit.slt index 439df7fede51..7341a9d43bac 100644 --- a/datafusion/sqllogictest/test_files/limit.slt +++ b/datafusion/sqllogictest/test_files/limit.slt @@ -521,7 +521,7 @@ drop table aggregate_test_100; query I COPY (select * from (values (1, 'a'), (2, 'b'), (3, 'c'), (4, 'd'), (5, 'e') -)) TO 'test_files/scratch/limit/data.csv' STORED AS CSV; +)) TO 'test_files/scratch/limit/data.csv' STORED AS CSV OPTIONS ('format.has_header' 'false'); ---- 5 diff --git a/datafusion/sqllogictest/test_files/order.slt b/datafusion/sqllogictest/test_files/order.slt index 569602166b38..f0151417e555 100644 --- a/datafusion/sqllogictest/test_files/order.slt +++ b/datafusion/sqllogictest/test_files/order.slt @@ -98,7 +98,8 @@ NULL three statement ok CREATE EXTERNAL TABLE test (c1 int, c2 bigint, c3 boolean) -STORED AS CSV LOCATION '../core/tests/data/partitioned_csv'; +STORED AS CSV LOCATION '../core/tests/data/partitioned_csv' +OPTIONS('format.has_header' 'false'); # Demonstrate types query TTT @@ -463,7 +464,8 @@ CREATE EXTERNAL TABLE csv_with_timestamps ( ) STORED AS CSV WITH ORDER (ts ASC NULLS LAST) -LOCATION '../core/tests/data/timestamps.csv'; +LOCATION '../core/tests/data/timestamps.csv' +OPTIONS('format.has_header' 'false'); query TT EXPLAIN SELECT DATE_BIN(INTERVAL '15 minutes', ts, TIMESTAMP '2022-08-03 14:40:00Z') as db15 diff --git a/datafusion/sqllogictest/test_files/projection.slt b/datafusion/sqllogictest/test_files/projection.slt index 3c8855e34712..b5bcb5b4c6f7 100644 --- a/datafusion/sqllogictest/test_files/projection.slt +++ b/datafusion/sqllogictest/test_files/projection.slt @@ -64,11 +64,13 @@ CREATE TABLE cpu_load_short(host STRING NOT NULL) AS VALUES statement ok CREATE EXTERNAL TABLE test (c1 int, c2 bigint, c3 boolean) -STORED AS CSV LOCATION '../core/tests/data/partitioned_csv'; +STORED AS CSV LOCATION '../core/tests/data/partitioned_csv' +OPTIONS('format.has_header' 'false'); statement ok CREATE EXTERNAL TABLE test_simple (c1 int, c2 bigint, c3 boolean) -STORED AS CSV LOCATION '../core/tests/data/partitioned_csv/partition-0.csv'; +STORED AS CSV LOCATION '../core/tests/data/partitioned_csv/partition-0.csv' +OPTIONS('format.has_header' 'false'); # projection same fields query I rowsort diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index ddf6a7aabffc..f56ac414a302 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -49,7 +49,8 @@ OPTIONS ('format.has_header' 'true'); ### execute_with_partition with 4 partitions statement ok CREATE EXTERNAL TABLE test (c1 int, c2 bigint, c3 boolean) -STORED AS CSV LOCATION '../core/tests/data/partitioned_csv'; +STORED AS CSV LOCATION '../core/tests/data/partitioned_csv' +OPTIONS('format.has_header' 'false'); # for window functions without order by the first, last, and nth function call does not make sense