From e4bbffa4263bc5409faead8aa0f0820729cd0440 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 24 Nov 2024 07:14:59 -0500 Subject: [PATCH 1/8] Temp patch to new sqlparser --- Cargo.toml | 4 + datafusion-cli/Cargo.lock | 189 +++++++++++++++++++++----------------- 2 files changed, 107 insertions(+), 86 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 76bc50d59a08..6e2127430a8e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -175,3 +175,7 @@ large_futures = "warn" [workspace.lints.rust] unexpected_cfgs = { level = "warn", check-cfg = ["cfg(tarpaulin)"] } unused_qualifications = "deny" + +## Temp patch for sqlparser +[patch.crates-io] +sqlparser = { git = "https://github.com/apache/datafusion-sqlparser-rs.git", rev = "4ab3ab91473d152c652e6582b63abb13535703f9" } diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 04b0b0d22cfd..54c6623ba2b1 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -220,7 +220,7 @@ dependencies = [ "chrono", "chrono-tz", "half", - "hashbrown 0.15.1", + "hashbrown 0.15.2", "num", ] @@ -406,9 +406,9 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.17" +version = "0.4.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cb8f1d480b0ea3783ab015936d2a55c87e219676f0c0b7dec61494043f21857" +checksum = "df895a515f70646414f4b45c0b79082783b80552b373a68283012928df56f522" dependencies = [ "bzip2", "flate2", @@ -814,9 +814,9 @@ dependencies = [ [[package]] name = "blake3" -version = "1.5.4" +version = "1.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d82033247fd8e890df8f740e407ad4d038debb9eb1f40533fffb32e7d17dc6f7" +checksum = "b8ee0c1824c4dea5b5f81736aff91bae041d2c07ee1192bec91054e10e3e601e" dependencies = [ "arrayref", "arrayvec", @@ -880,9 +880,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.8.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da" +checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b" [[package]] name = "bytes-utils" @@ -917,9 +917,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.1" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd9de9f2205d5ef3fd67e685b0df337994ddd4495e2a28d185500d0e1edfea47" +checksum = "f34d93e62b03caf570cccc334cbc6c2fceca82f39211051345108adcba3eebdc" dependencies = [ "jobserver", "libc", @@ -1080,6 +1080,16 @@ dependencies = [ "libc", ] +[[package]] +name = "core-foundation" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b55271e5c8c478ad3f38ad24ef34923091e0548492a266d19b3c0b4d82574c63" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "core-foundation-sys" version = "0.8.7" @@ -1097,9 +1107,9 @@ dependencies = [ [[package]] name = "cpufeatures" -version = "0.2.15" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ca741a962e1b0bff6d724a1a0958b686406e853bb14061f218562e1896f95e6" +checksum = "16b80225097f2e5ae4e7179dd2266824648f3e2f49d9134d584b76389d31c4c3" dependencies = [ "libc", ] @@ -1700,12 +1710,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.9" +version = "0.3.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" +checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -1972,9 +1982,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.15.1" +version = "0.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a9bfc1af68b1726ea47d3d5109de126281def866b33970e10fbab11b5dafab3" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" [[package]] name = "heck" @@ -1988,12 +1998,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" -[[package]] -name = "hermit-abi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" - [[package]] name = "hex" version = "0.4.3" @@ -2162,8 +2166,8 @@ dependencies = [ "http 1.1.0", "hyper 1.5.1", "hyper-util", - "rustls 0.23.17", - "rustls-native-certs 0.8.0", + "rustls 0.23.19", + "rustls-native-certs 0.8.1", "rustls-pki-types", "tokio", "tokio-rustls 0.26.0", @@ -2358,7 +2362,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" dependencies = [ "equivalent", - "hashbrown 0.15.1", + "hashbrown 0.15.2", ] [[package]] @@ -2390,9 +2394,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.13" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "540654e97a3f4470a492cd30ff187bc95d89557a903a2bbf112e2fae98104ef2" +checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" [[package]] name = "jobserver" @@ -2405,10 +2409,11 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.72" +version = "0.3.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a88f1bda2bd75b0452a14784937d796722fdebfe50df998aeb3f0b7603019a9" +checksum = "a865e038f7f6ed956f788f0d7d60c541fff74c7bd74272c5d4cf15c63743e705" dependencies = [ + "once_cell", "wasm-bindgen", ] @@ -2484,9 +2489,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.164" +version = "0.2.167" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "433bfe06b8c75da9b2e3fbea6e5329ff87748f0b144ef75306e674c3f6f7c13f" +checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc" [[package]] name = "libflate" @@ -2546,9 +2551,9 @@ checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] name = "litemap" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" +checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104" [[package]] name = "lock_api" @@ -2628,11 +2633,10 @@ dependencies = [ [[package]] name = "mio" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" +checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" dependencies = [ - "hermit-abi", "libc", "wasi", "windows-sys 0.52.0", @@ -2862,7 +2866,7 @@ dependencies = [ "flate2", "futures", "half", - "hashbrown 0.15.1", + "hashbrown 0.15.2", "lz4_flex", "num", "num-bigint", @@ -3020,9 +3024,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.89" +version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" +checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" dependencies = [ "unicode-ident", ] @@ -3063,7 +3067,7 @@ dependencies = [ "quinn-proto", "quinn-udp", "rustc-hash", - "rustls 0.23.17", + "rustls 0.23.19", "socket2", "thiserror 2.0.3", "tokio", @@ -3081,7 +3085,7 @@ dependencies = [ "rand", "ring", "rustc-hash", - "rustls 0.23.17", + "rustls 0.23.19", "rustls-pki-types", "slab", "thiserror 2.0.3", @@ -3259,8 +3263,8 @@ dependencies = [ "percent-encoding", "pin-project-lite", "quinn", - "rustls 0.23.17", - "rustls-native-certs 0.8.0", + "rustls 0.23.19", + "rustls-native-certs 0.8.1", "rustls-pemfile 2.2.0", "rustls-pki-types", "serde", @@ -3378,9 +3382,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.17" +version = "0.23.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f1a745511c54ba6d4465e8d5dfbd81b45791756de28d4981af70d6dca128f1e" +checksum = "934b404430bb06b3fae2cba809eb45a1ab1aecd64491213d7c3301b88393f8d1" dependencies = [ "once_cell", "ring", @@ -3399,20 +3403,19 @@ dependencies = [ "openssl-probe", "rustls-pemfile 1.0.4", "schannel", - "security-framework", + "security-framework 2.11.1", ] [[package]] name = "rustls-native-certs" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcaf18a4f2be7326cd874a5fa579fae794320a0f388d365dca7e480e55f83f8a" +checksum = "7fcff2dd52b58a8d98a70243663a0d234c4e2b79235637849d15913394a247d3" dependencies = [ "openssl-probe", - "rustls-pemfile 2.2.0", "rustls-pki-types", "schannel", - "security-framework", + "security-framework 3.0.1", ] [[package]] @@ -3538,7 +3541,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" dependencies = [ "bitflags 2.6.0", - "core-foundation", + "core-foundation 0.9.4", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework" +version = "3.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1415a607e92bec364ea2cf9264646dcce0f91e6d65281bd6f2819cca3bf39c8" +dependencies = [ + "bitflags 2.6.0", + "core-foundation 0.10.0", "core-foundation-sys", "libc", "security-framework-sys", @@ -3686,9 +3702,9 @@ checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" [[package]] name = "socket2" -version = "0.5.7" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" +checksum = "c970269d99b64e60ec3bd6ad27270092a5394c4e309314b18ae3fe575695fbe8" dependencies = [ "libc", "windows-sys 0.52.0", @@ -3801,9 +3817,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "2.0.87" +version = "2.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" +checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31" dependencies = [ "proc-macro2", "quote", @@ -4009,7 +4025,7 @@ version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" dependencies = [ - "rustls 0.23.17", + "rustls 0.23.19", "rustls-pki-types", "tokio", ] @@ -4052,9 +4068,9 @@ checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" [[package]] name = "tracing" -version = "0.1.40" +version = "0.1.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" dependencies = [ "pin-project-lite", "tracing-attributes", @@ -4063,9 +4079,9 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.27" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", @@ -4074,9 +4090,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.32" +version = "0.1.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" dependencies = [ "once_cell", ] @@ -4155,9 +4171,9 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "url" -version = "2.5.3" +version = "2.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d157f1b96d14500ffdc1f10ba712e780825526c03d9a49b4d0324b0d9113ada" +checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" dependencies = [ "form_urlencoded", "idna", @@ -4246,9 +4262,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.95" +version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "128d1e363af62632b8eb57219c8fd7877144af57558fb2ef0368d0087bddeb2e" +checksum = "d15e63b4482863c109d70a7b8706c1e364eb6ea449b201a76c5b89cedcec2d5c" dependencies = [ "cfg-if", "once_cell", @@ -4257,9 +4273,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.95" +version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb6dd4d3ca0ddffd1dd1c9c04f94b868c37ff5fac97c30b97cff2d74fce3a358" +checksum = "8d36ef12e3aaca16ddd3f67922bc63e48e953f126de60bd33ccc0101ef9998cd" dependencies = [ "bumpalo", "log", @@ -4272,21 +4288,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.45" +version = "0.4.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc7ec4f8827a71586374db3e87abdb5a2bb3a15afed140221307c3ec06b1f63b" +checksum = "9dfaf8f50e5f293737ee323940c7d8b08a66a95a419223d9f41610ca08b0833d" dependencies = [ "cfg-if", "js-sys", + "once_cell", "wasm-bindgen", "web-sys", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.95" +version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e79384be7f8f5a9dd5d7167216f022090cf1f9ec128e6e6a482a2cb5c5422c56" +checksum = "705440e08b42d3e4b36de7d66c944be628d579796b8090bfa3471478a2260051" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -4294,9 +4311,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.95" +version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68" +checksum = "98c9ae5a76e46f4deecd0f0255cc223cfa18dc9b261213b8aa0c7b36f61b3f1d" dependencies = [ "proc-macro2", "quote", @@ -4307,9 +4324,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.95" +version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d" +checksum = "6ee99da9c5ba11bd675621338ef6fa52296b76b83305e9b6e5c77d4c286d6d49" [[package]] name = "wasm-streams" @@ -4326,9 +4343,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.72" +version = "0.3.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6488b90108c040df0fe62fa815cbdee25124641df01814dd7282749234c6112" +checksum = "a98bc3c33f0fe7e59ad7cd041b89034fa82a7c2d4365ca538dda6cdaf513863c" dependencies = [ "js-sys", "wasm-bindgen", @@ -4578,9 +4595,9 @@ dependencies = [ [[package]] name = "yoke" -version = "0.7.4" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c5b1314b079b0930c31e3af543d8ee1757b1951ae1e1565ec704403a7240ca5" +checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40" dependencies = [ "serde", "stable_deref_trait", @@ -4590,9 +4607,9 @@ dependencies = [ [[package]] name = "yoke-derive" -version = "0.7.4" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95" +checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", @@ -4623,18 +4640,18 @@ dependencies = [ [[package]] name = "zerofrom" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91ec111ce797d0e0784a1116d0ddcdbea84322cd79e5d5ad173daeba4f93ab55" +checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e" dependencies = [ "zerofrom-derive", ] [[package]] name = "zerofrom-derive" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5" +checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" dependencies = [ "proc-macro2", "quote", From 3545647960c898c0ee8f144907f43051b13c66a8 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 24 Nov 2024 07:31:32 -0500 Subject: [PATCH 2/8] Update for new sqlparser API --- datafusion/common/src/utils/mod.rs | 5 +- .../user_defined_scalar_functions.rs | 10 +- datafusion/expr/src/expr.rs | 2 + datafusion/sql/src/expr/function.rs | 5 + datafusion/sql/src/expr/mod.rs | 4 +- datafusion/sql/src/parser.rs | 14 +- datafusion/sql/src/planner.rs | 10 +- datafusion/sql/src/select.rs | 1 + datafusion/sql/src/statement.rs | 144 +++++++++++++----- datafusion/sql/src/unparser/ast.rs | 3 + datafusion/sql/src/unparser/dialect.rs | 5 +- datafusion/sql/src/unparser/expr.rs | 16 +- datafusion/sql/src/unparser/plan.rs | 9 +- datafusion/sql/src/unparser/utils.rs | 11 +- .../test_files/information_schema.slt | 4 +- 15 files changed, 182 insertions(+), 61 deletions(-) diff --git a/datafusion/common/src/utils/mod.rs b/datafusion/common/src/utils/mod.rs index f3bba8e254d9..c4a43446e1e5 100644 --- a/datafusion/common/src/utils/mod.rs +++ b/datafusion/common/src/utils/mod.rs @@ -775,10 +775,10 @@ pub fn get_available_parallelism() -> usize { #[cfg(test)] mod tests { + use super::*; use crate::ScalarValue::Null; use arrow::array::Float64Array; - - use super::*; + use sqlparser::tokenizer::Span; #[test] fn test_bisect_linear_left_and_right() -> Result<()> { @@ -1006,6 +1006,7 @@ mod tests { let expected_parsed = vec![Ident { value: identifier.to_string(), quote_style, + span: Span::empty(), }]; assert_eq!( diff --git a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs index a59394f90814..79577eaa1d50 100644 --- a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs +++ b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs @@ -27,10 +27,6 @@ use arrow_array::{ Array, ArrayRef, Float32Array, Float64Array, Int32Array, RecordBatch, StringArray, }; use arrow_schema::{DataType, Field, Schema}; -use parking_lot::Mutex; -use regex::Regex; -use sqlparser::ast::Ident; - use datafusion::execution::context::{FunctionFactory, RegisterFunction, SessionState}; use datafusion::prelude::*; use datafusion::{execution::registry::FunctionRegistry, test_util}; @@ -48,6 +44,10 @@ use datafusion_expr::{ Volatility, }; use datafusion_functions_nested::range::range_udf; +use parking_lot::Mutex; +use regex::Regex; +use sqlparser::ast::Ident; +use sqlparser::tokenizer::Span; /// test that casting happens on udfs. /// c11 is f32, but `custom_sqrt` requires f64. Casting happens but the logical plan and @@ -1187,6 +1187,7 @@ async fn create_scalar_function_from_sql_statement_postgres_syntax() -> Result<( name: Some(Ident { value: "name".into(), quote_style: None, + span: Span::empty(), }), data_type: DataType::Utf8, default_expr: None, @@ -1196,6 +1197,7 @@ async fn create_scalar_function_from_sql_statement_postgres_syntax() -> Result<( language: Some(Ident { value: "plrust".into(), quote_style: None, + span: Span::empty(), }), behavior: None, function_body: Some(lit(body)), diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index c495b5396f53..d0e98327f34b 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -222,6 +222,8 @@ use sqlparser::ast::{ /// // to 42 = 5 AND b = 6 /// assert_eq!(rewritten.data, lit(42).eq(lit(5)).and(col("b").eq(lit(6)))); #[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)] +// TODO make the enum smaller with more boxing (looks like Wildcard is now bigger) +#[allow(clippy::large_enum_variant)] pub enum Expr { /// An expression with a specific name. Alias(Alias), diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs index 67fa23b86990..15b415e3dc42 100644 --- a/datafusion/sql/src/expr/function.rs +++ b/datafusion/sql/src/expr/function.rs @@ -169,6 +169,11 @@ impl FunctionArgs { "Calling {name}: SEPARATOR not supported in function arguments: {sep}" ) } + FunctionArgumentClause::JsonNullClause(jn) => { + return not_impl_err!( + "Calling {name}: JSON NULL clause not supported in function arguments: {jn}" + ) + } } } diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index 57ac96951f1f..0195f3d32032 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -565,11 +565,11 @@ impl SqlToRel<'_, S> { } not_impl_err!("AnyOp not supported by ExprPlanner: {binary_expr:?}") } - SQLExpr::Wildcard => Ok(Expr::Wildcard { + SQLExpr::Wildcard(_token) => Ok(Expr::Wildcard { qualifier: None, options: WildcardOptions::default(), }), - SQLExpr::QualifiedWildcard(object_name) => Ok(Expr::Wildcard { + SQLExpr::QualifiedWildcard(object_name, _token) => Ok(Expr::Wildcard { qualifier: Some(self.object_name_to_table_reference(object_name)?), options: WildcardOptions::default(), }), diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs index bd1ed3145ef5..fb83ecea88b8 100644 --- a/datafusion/sql/src/parser.rs +++ b/datafusion/sql/src/parser.rs @@ -20,6 +20,7 @@ use std::collections::VecDeque; use std::fmt; +use sqlparser::tokenizer::TokenWithSpan; use sqlparser::{ ast::{ ColumnDef, ColumnOptionDef, Expr, ObjectName, OrderByExpr, Query, @@ -27,7 +28,7 @@ use sqlparser::{ }, dialect::{keywords::Keyword, Dialect, GenericDialect}, parser::{Parser, ParserError}, - tokenizer::{Token, TokenWithLocation, Tokenizer, Word}, + tokenizer::{Token, Tokenizer, Word}, }; // Use `Parser::expected` instead, if possible @@ -337,7 +338,7 @@ impl<'a> DFParser<'a> { fn expected( &self, expected: &str, - found: TokenWithLocation, + found: TokenWithSpan, ) -> Result { parser_err!(format!("Expected {expected}, found: {found}")) } @@ -875,6 +876,7 @@ mod tests { use super::*; use sqlparser::ast::Expr::Identifier; use sqlparser::ast::{BinaryOperator, DataType, Expr, Ident}; + use sqlparser::tokenizer::Span; fn expect_parse_ok(sql: &str, expected: Statement) -> Result<(), ParserError> { let statements = DFParser::parse_sql(sql)?; @@ -910,6 +912,7 @@ mod tests { name: Ident { value: name.into(), quote_style: None, + span: Span::empty(), }, data_type, collation: None, @@ -1218,6 +1221,7 @@ mod tests { expr: Identifier(Ident { value: "c1".to_owned(), quote_style: None, + span: Span::empty(), }), asc, nulls_first, @@ -1249,6 +1253,7 @@ mod tests { expr: Identifier(Ident { value: "c1".to_owned(), quote_style: None, + span: Span::empty(), }), asc: Some(true), nulls_first: None, @@ -1258,6 +1263,7 @@ mod tests { expr: Identifier(Ident { value: "c2".to_owned(), quote_style: None, + span: Span::empty(), }), asc: Some(false), nulls_first: Some(true), @@ -1289,11 +1295,13 @@ mod tests { left: Box::new(Identifier(Ident { value: "c1".to_owned(), quote_style: None, + span: Span::empty(), })), op: BinaryOperator::Minus, right: Box::new(Identifier(Ident { value: "c2".to_owned(), quote_style: None, + span: Span::empty(), })), }, asc: Some(true), @@ -1334,11 +1342,13 @@ mod tests { left: Box::new(Identifier(Ident { value: "c1".to_owned(), quote_style: None, + span: Span::empty(), })), op: BinaryOperator::Minus, right: Box::new(Identifier(Ident { value: "c2".to_owned(), quote_style: None, + span: Span::empty(), })), }, asc: Some(true), diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 382c503154dd..6ef2eb4e7fb2 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -339,7 +339,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { plan: LogicalPlan, alias: TableAlias, ) -> Result { - let plan = self.apply_expr_alias(plan, alias.columns)?; + let idents = alias.columns.into_iter().map(|c| c.name).collect(); + let plan = self.apply_expr_alias(plan, idents)?; LogicalPlanBuilder::from(plan) .alias(TableReference::bare( @@ -586,6 +587,13 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { | SQLDataType::Nullable(_) | SQLDataType::LowCardinality(_) | SQLDataType::Trigger + // MySQL datatypes + | SQLDataType::TinyBlob + | SQLDataType::MediumBlob + | SQLDataType::LongBlob + | SQLDataType::TinyText + | SQLDataType::MediumText + | SQLDataType::LongText => not_impl_err!( "Unsupported SQL type {sql_type:?}" ), diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index 39b6eb6e8132..9899d92bde71 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -654,6 +654,7 @@ impl SqlToRel<'_, S> { opt_rename, opt_replace: _opt_replace, opt_ilike: _opt_ilike, + wildcard_token: _wildcard_token, } = options; if opt_rename.is_some() { diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index 38695f98b5fe..47a842643b8d 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -54,13 +54,13 @@ use datafusion_expr::{ TransactionConclusion, TransactionEnd, TransactionIsolationLevel, TransactionStart, Volatility, WriteOp, }; -use sqlparser::ast::{self, SqliteOnConflict}; +use sqlparser::ast::{self, ShowStatementIn, ShowStatementOptions, SqliteOnConflict}; use sqlparser::ast::{ Assignment, AssignmentTarget, ColumnDef, CreateIndex, CreateTable, CreateTableOptions, Delete, DescribeAlias, Expr as SQLExpr, FromTable, Ident, Insert, ObjectName, ObjectType, OneOrManyWithParens, Query, SchemaName, SetExpr, - ShowCreateObject, ShowStatementFilter, Statement, TableConstraint, TableFactor, - TableWithJoins, TransactionMode, UnaryOperator, Value, + ShowCreateObject, Statement, TableConstraint, TableFactor, TableWithJoins, + TransactionMode, UnaryOperator, Value, }; use sqlparser::parser::ParserError::ParserError; @@ -685,19 +685,99 @@ impl SqlToRel<'_, S> { Statement::ShowTables { extended, full, - db_name, - filter, - // SHOW TABLES IN/FROM are equivalent, this field specifies which the user - // specified, but it doesn't affect the plan so ignore the field - clause: _, - } => self.show_tables_to_plan(extended, full, db_name, filter), + terse, + history, + external, + show_options, + } => { + // We only support the basic "SHOW TABLES" + // https://github.com/apache/datafusion/issues/3188 + if extended { + return not_impl_err!("SHOW TABLES EXTENDED not supported")?; + } + if full { + return not_impl_err!("SHOW FULL TABLES not supported")?; + } + if terse { + return not_impl_err!("SHOW TERSE TABLES not supported")?; + } + if history { + return not_impl_err!("SHOW TABLES HISTORY not supported")?; + } + if external { + return not_impl_err!("SHOW EXTERNAL TABLES not supported")?; + } + let ShowStatementOptions { + show_in, + starts_with, + limit, + limit_from, + filter_position, + } = show_options; + if show_in.is_some() { + return not_impl_err!("SHOW TABLES IN not supported")?; + } + if starts_with.is_some() { + return not_impl_err!("SHOW TABLES LIKE not supported")?; + } + if limit.is_some() { + return not_impl_err!("SHOW TABLES LIMIT not supported")?; + } + if limit_from.is_some() { + return not_impl_err!("SHOW TABLES LIMIT FROM not supported")?; + } + if filter_position.is_some() { + return not_impl_err!("SHOW TABLES FILTER not supported")?; + } + self.show_tables_to_plan() + } Statement::ShowColumns { extended, full, - table_name, - filter, - } => self.show_columns_to_plan(extended, full, table_name, filter), + show_options, + } => { + let ShowStatementOptions { + show_in, + starts_with, + limit, + limit_from, + filter_position, + } = show_options; + if starts_with.is_some() { + return not_impl_err!("SHOW COLUMNS LIKE not supported")?; + } + if limit.is_some() { + return not_impl_err!("SHOW COLUMNS LIMIT not supported")?; + } + if limit_from.is_some() { + return not_impl_err!("SHOW COLUMNS LIMIT FROM not supported")?; + } + if filter_position.is_some() { + return not_impl_err!( + "SHOW COLUMNS with WHERE or LIKE is not supported" + )?; + } + let Some(ShowStatementIn { + // specifies if the syntax was `SHOW COLUMNS IN` or `SHOW + // COLUMNS FROM` which is not different in DataFusion + clause: _, + parent_type, + parent_name, + }) = show_in + else { + return plan_err!("SHOW COLUMNS requires a table name"); + }; + + if let Some(parent_type) = parent_type { + return not_impl_err!("SHOW COLUMNS IN {parent_type} not supported"); + } + let Some(table_name) = parent_name else { + return plan_err!("SHOW COLUMNS requires a table name"); + }; + + self.show_columns_to_plan(extended, full, table_name) + } Statement::Insert(Insert { or, @@ -766,10 +846,14 @@ impl SqlToRel<'_, S> { from, selection, returning, + or, } => { if returning.is_some() { plan_err!("Update-returning clause not yet supported")?; } + if or.is_some() { + plan_err!("ON conflict not supported")?; + } self.update_to_plan(table, assignments, from, selection) } @@ -1065,24 +1149,12 @@ impl SqlToRel<'_, S> { } /// Generate a logical plan from a "SHOW TABLES" query - fn show_tables_to_plan( - &self, - extended: bool, - full: bool, - db_name: Option, - filter: Option, - ) -> Result { + fn show_tables_to_plan(&self) -> Result { if self.has_table("information_schema", "tables") { - // We only support the basic "SHOW TABLES" - // https://github.com/apache/datafusion/issues/3188 - if db_name.is_some() || filter.is_some() || full || extended { - plan_err!("Unsupported parameters to SHOW TABLES") - } else { - let query = "SELECT * FROM information_schema.tables;"; - let mut rewrite = DFParser::parse_sql(query)?; - assert_eq!(rewrite.len(), 1); - self.statement_to_plan(rewrite.pop_front().unwrap()) // length of rewrite is 1 - } + let query = "SELECT * FROM information_schema.tables;"; + let mut rewrite = DFParser::parse_sql(query)?; + assert_eq!(rewrite.len(), 1); + self.statement_to_plan(rewrite.pop_front().unwrap()) // length of rewrite is 1 } else { plan_err!("SHOW TABLES is not supported unless information_schema is enabled") } @@ -1842,22 +1914,18 @@ impl SqlToRel<'_, S> { extended: bool, full: bool, sql_table_name: ObjectName, - filter: Option, ) -> Result { - if filter.is_some() { - return plan_err!("SHOW COLUMNS with WHERE or LIKE is not supported"); - } + // Figure out the where clause + let where_clause = object_name_to_qualifier( + &sql_table_name, + self.options.enable_ident_normalization, + ); if !self.has_table("information_schema", "columns") { return plan_err!( "SHOW COLUMNS is not supported unless information_schema is enabled" ); } - // Figure out the where clause - let where_clause = object_name_to_qualifier( - &sql_table_name, - self.options.enable_ident_normalization, - ); // Do a table lookup to verify the table exists let table_ref = self.object_name_to_table_reference(sql_table_name)?; diff --git a/datafusion/sql/src/unparser/ast.rs b/datafusion/sql/src/unparser/ast.rs index cc0812cd71e1..8812bbcbe0b7 100644 --- a/datafusion/sql/src/unparser/ast.rs +++ b/datafusion/sql/src/unparser/ast.rs @@ -24,6 +24,7 @@ use core::fmt; use sqlparser::ast; +use sqlparser::ast::helpers::attached_token::AttachedToken; #[derive(Clone)] pub(super) struct QueryBuilder { @@ -268,6 +269,7 @@ impl SelectBuilder { connect_by: None, window_before_qualify: false, prewhere: None, + select_token: AttachedToken::empty(), }) } fn create_empty() -> Self { @@ -458,6 +460,7 @@ impl TableRelationBuilder { version: self.version.clone(), partitions: self.partitions.clone(), with_ordinality: false, + json_path: None, }) } fn create_empty() -> Self { diff --git a/datafusion/sql/src/unparser/dialect.rs b/datafusion/sql/src/unparser/dialect.rs index e979d8fd4ebd..2657d9970f1a 100644 --- a/datafusion/sql/src/unparser/dialect.rs +++ b/datafusion/sql/src/unparser/dialect.rs @@ -18,15 +18,15 @@ use std::sync::Arc; use arrow_schema::TimeUnit; +use datafusion_common::Result; use datafusion_expr::Expr; use regex::Regex; +use sqlparser::tokenizer::Span; use sqlparser::{ ast::{self, BinaryOperator, Function, Ident, ObjectName, TimezoneInfo}, keywords::ALL_KEYWORDS, }; -use datafusion_common::Result; - use super::{utils::character_length_to_sql, utils::date_part_to_sql, Unparser}; /// `Dialect` to use for Unparsing @@ -279,6 +279,7 @@ impl PostgreSqlDialect { name: ObjectName(vec![Ident { value: func_name.to_string(), quote_style: None, + span: Span::empty(), }]), args: ast::FunctionArguments::List(ast::FunctionArgumentList { duplicate_treatment: None, diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs index 48dfc425ee63..88b2e79558ca 100644 --- a/datafusion/sql/src/unparser/expr.rs +++ b/datafusion/sql/src/unparser/expr.rs @@ -43,6 +43,8 @@ use datafusion_expr::{ expr::{Alias, Exists, InList, ScalarFunction, Sort, WindowFunction}, Between, BinaryExpr, Case, Cast, Expr, GroupingSet, Like, Operator, TryCast, }; +use sqlparser::ast::helpers::attached_token::AttachedToken; +use sqlparser::tokenizer::Span; /// Convert a DataFusion [`Expr`] to [`ast::Expr`] /// @@ -233,6 +235,7 @@ impl Unparser<'_> { name: ObjectName(vec![Ident { value: func_name.to_string(), quote_style: None, + span: Span::empty(), }]), args: ast::FunctionArguments::List(ast::FunctionArgumentList { duplicate_treatment: None, @@ -278,6 +281,7 @@ impl Unparser<'_> { name: ObjectName(vec![Ident { value: func_name.to_string(), quote_style: None, + span: Span::empty(), }]), args: ast::FunctionArguments::List(ast::FunctionArgumentList { duplicate_treatment: agg @@ -404,12 +408,16 @@ impl Unparser<'_> { } // TODO: unparsing wildcard addition options Expr::Wildcard { qualifier, .. } => { + let attached_token = AttachedToken::empty(); if let Some(qualifier) = qualifier { let idents: Vec = qualifier.to_vec().into_iter().map(Ident::new).collect(); - Ok(ast::Expr::QualifiedWildcard(ObjectName(idents))) + Ok(ast::Expr::QualifiedWildcard( + ObjectName(idents), + attached_token, + )) } else { - Ok(ast::Expr::Wildcard) + Ok(ast::Expr::Wildcard(attached_token)) } } Expr::GroupingSet(grouping_set) => match grouping_set { @@ -480,6 +488,7 @@ impl Unparser<'_> { name: ObjectName(vec![Ident { value: func_name.to_string(), quote_style: None, + span: Span::empty(), }]), args: ast::FunctionArguments::List(ast::FunctionArgumentList { duplicate_treatment: None, @@ -709,6 +718,7 @@ impl Unparser<'_> { Ident { value: ident, quote_style, + span: Span::empty(), } } @@ -716,6 +726,7 @@ impl Unparser<'_> { Ident { value: str, quote_style: None, + span: Span::empty(), } } @@ -1481,6 +1492,7 @@ impl Unparser<'_> { name: ObjectName(vec![Ident { value: "UNNEST".to_string(), quote_style: None, + span: Span::empty(), }]), args: ast::FunctionArguments::List(ast::FunctionArgumentList { duplicate_treatment: None, diff --git a/datafusion/sql/src/unparser/plan.rs b/datafusion/sql/src/unparser/plan.rs index 81e47ed939f2..1a1d4c42b0ff 100644 --- a/datafusion/sql/src/unparser/plan.rs +++ b/datafusion/sql/src/unparser/plan.rs @@ -42,7 +42,7 @@ use datafusion_expr::{ expr::Alias, BinaryExpr, Distinct, Expr, JoinConstraint, JoinType, LogicalPlan, LogicalPlanBuilder, Operator, Projection, SortExpr, TableScan, }; -use sqlparser::ast::{self, Ident, SetExpr}; +use sqlparser::ast::{self, Ident, SetExpr, TableAliasColumnDef}; use std::sync::Arc; /// Convert a DataFusion [`LogicalPlan`] to [`ast::Statement`] @@ -1006,6 +1006,13 @@ impl Unparser<'_> { } fn new_table_alias(&self, alias: String, columns: Vec) -> ast::TableAlias { + let columns = columns + .into_iter() + .map(|ident| TableAliasColumnDef { + name: ident, + data_type: None, + }) + .collect(); ast::TableAlias { name: self.new_ident_quoted_if_needs(alias), columns, diff --git a/datafusion/sql/src/unparser/utils.rs b/datafusion/sql/src/unparser/utils.rs index c77af7924384..38dcbe981efa 100644 --- a/datafusion/sql/src/unparser/utils.rs +++ b/datafusion/sql/src/unparser/utils.rs @@ -17,6 +17,10 @@ use std::{cmp::Ordering, sync::Arc, vec}; +use super::{ + dialect::CharacterLengthStyle, dialect::DateFieldExtractStyle, + rewrite::TableAliasRewriter, Unparser, +}; use datafusion_common::{ internal_err, tree_node::{Transformed, TransformedResult, TreeNode}, @@ -28,11 +32,7 @@ use datafusion_expr::{ }; use indexmap::IndexSet; use sqlparser::ast; - -use super::{ - dialect::CharacterLengthStyle, dialect::DateFieldExtractStyle, - rewrite::TableAliasRewriter, Unparser, -}; +use sqlparser::tokenizer::Span; /// Recursively searches children of [LogicalPlan] to find an Aggregate node if exists /// prior to encountering a Join, TableScan, or a nested subquery (derived table factor). @@ -425,6 +425,7 @@ pub(crate) fn date_part_to_sql( name: ast::ObjectName(vec![ast::Ident { value: "strftime".to_string(), quote_style: None, + span: Span::empty(), }]), args: ast::FunctionArguments::List(ast::FunctionArgumentList { duplicate_treatment: None, diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 4d51a61c8a52..95098038ae2f 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -483,10 +483,10 @@ set datafusion.catalog.information_schema = true; statement ok CREATE TABLE t AS SELECT 1::int as i; -statement error Error during planning: SHOW COLUMNS with WHERE or LIKE is not supported +statement error DataFusion error: This feature is not implemented: SHOW COLUMNS with WHERE or LIKE is not supported SHOW COLUMNS FROM t LIKE 'f'; -statement error Error during planning: SHOW COLUMNS with WHERE or LIKE is not supported +statement error DataFusion error: This feature is not implemented: SHOW COLUMNS with WHERE or LIKE is not supported SHOW COLUMNS FROM t WHERE column_name = 'bar'; query TTTTTT From 34c388a370642b22bfb46f52dbede005799d3490 Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Thu, 5 Dec 2024 15:08:22 +0100 Subject: [PATCH 3/8] feat: init diagnostics --- Cargo.toml | 1 + datafusion/common/Cargo.toml | 1 + datafusion/common/src/column.rs | 33 ++++++- datafusion/common/src/dfschema.rs | 2 + datafusion/common/src/diagnostic.rs | 21 ++++ datafusion/common/src/error.rs | 42 ++++++++ datafusion/common/src/lib.rs | 4 + datafusion/common/src/with_span.rs | 96 +++++++++++++++++++ .../core/tests/execution/logical_plan.rs | 2 + datafusion/expr-common/Cargo.toml | 1 + .../expr-common/src/type_coercion/binary.rs | 87 ++++++++++++----- datafusion/expr/src/expr.rs | 10 +- datafusion/expr/src/expr_rewriter/mod.rs | 5 +- datafusion/expr/src/expr_rewriter/order_by.rs | 3 + datafusion/expr/src/expr_schema.rs | 25 ++++- datafusion/expr/src/logical_plan/builder.rs | 2 + .../src/analyzer/expand_wildcard_rule.rs | 20 ++-- .../optimizer/src/analyzer/type_coercion.rs | 1 + datafusion/optimizer/src/decorrelate.rs | 2 +- datafusion/sql/src/expr/identifier.rs | 4 + datafusion/sql/src/parser.rs | 4 +- datafusion/sql/src/unparser/expr.rs | 2 + datafusion/sql/src/unparser/plan.rs | 2 + 23 files changed, 326 insertions(+), 44 deletions(-) create mode 100644 datafusion/common/src/diagnostic.rs create mode 100644 datafusion/common/src/with_span.rs diff --git a/Cargo.toml b/Cargo.toml index 6e2127430a8e..99c441751721 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -149,6 +149,7 @@ sqlparser = { version = "0.52.0", features = ["visitor"] } tempfile = "3" tokio = { version = "1.36", features = ["macros", "rt", "sync"] } url = "2.2" +derivative = "2.2.0" [profile.release] codegen-units = 1 diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index d76848dfe95e..38448b7c3626 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -65,6 +65,7 @@ pyo3 = { version = "0.22.0", optional = true } recursive = { workspace = true } sqlparser = { workspace = true } tokio = { workspace = true } +derivative = { workspace = true } [target.'cfg(target_family = "wasm")'.dependencies] web-time = "1.1.0" diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index d940bcf3146e..fe29a24147c3 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -18,22 +18,35 @@ //! Column use arrow_schema::{Field, FieldRef}; +use sqlparser::tokenizer::Span; use crate::error::_schema_err; use crate::utils::{parse_identifiers_normalized, quote_identifier}; use crate::{DFSchema, Result, SchemaError, TableReference}; +use derivative::Derivative; use std::collections::HashSet; use std::convert::Infallible; use std::fmt; use std::str::FromStr; /// A named reference to a qualified field in a schema. -#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[derive(Debug, Clone, Derivative)] +#[derivative(PartialEq, Eq, Hash, PartialOrd, Ord)] pub struct Column { /// relation/table reference. pub relation: Option, /// field/column name. pub name: String, + #[derivative( + PartialEq = "ignore", + Hash = "ignore", + PartialOrd = "ignore", + Ord = "ignore" + )] + /// The location in the source code where this column originally came from. + /// Does not change as the [`Column`] undergoes normalization and other + /// transformations. + pub span: Span, } impl Column { @@ -50,6 +63,7 @@ impl Column { Self { relation: relation.map(|r| r.into()), name: name.into(), + span: Span::empty(), } } @@ -58,6 +72,7 @@ impl Column { Self { relation: None, name: name.into(), + span: Span::empty(), } } @@ -68,6 +83,7 @@ impl Column { Self { relation: None, name: name.into(), + span: Span::empty(), } } @@ -99,7 +115,11 @@ impl Column { // identifiers will be treated as an unqualified column name _ => return None, }; - Some(Self { relation, name }) + Some(Self { + relation, + name, + span: Span::empty(), + }) } /// Deserialize a fully qualified name string into a column @@ -113,6 +133,7 @@ impl Column { Self { relation: None, name: flat_name, + span: Span::empty(), }, ) } @@ -124,6 +145,7 @@ impl Column { Self { relation: None, name: flat_name, + span: Span::empty(), }, ) } @@ -254,6 +276,13 @@ impl Column { .collect(), }) } + + /// Attaches a [`Span`] to the [`Column`], i.e. its location in the source + /// SQL query. + pub fn with_span(mut self, span: Span) -> Self { + self.span = span; + self + } } impl From<&str> for Column { diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 45620c3cacc8..b53b475edaa0 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -32,6 +32,7 @@ use crate::{ use arrow::compute::can_cast_types; use arrow::datatypes::{DataType, Field, FieldRef, Fields, Schema, SchemaRef}; use arrow_schema::SchemaBuilder; +use sqlparser::tokenizer::Span; /// A reference-counted reference to a [DFSchema]. pub type DFSchemaRef = Arc; @@ -505,6 +506,7 @@ impl DFSchema { field: Column { relation: None, name: name.to_string(), + span: Span::empty(), }, }) } diff --git a/datafusion/common/src/diagnostic.rs b/datafusion/common/src/diagnostic.rs new file mode 100644 index 000000000000..1ebf3f9ed566 --- /dev/null +++ b/datafusion/common/src/diagnostic.rs @@ -0,0 +1,21 @@ +use sqlparser::tokenizer::Span; + +#[derive(Debug, Clone)] +pub struct Diagnostic { + pub entries: Vec, +} + +#[derive(Debug, Clone)] +pub struct DiagnosticEntry { + pub span: Span, + pub message: String, + pub kind: DiagnosticEntryKind, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum DiagnosticEntryKind { + Error, + Warning, + Note, + Help, +} diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index 4fac7298c455..39916e377aae 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -26,6 +26,7 @@ use std::io; use std::result; use std::sync::Arc; +use crate::diagnostic::Diagnostic; use crate::utils::quote_identifier; use crate::{Column, DFSchema, TableReference}; #[cfg(feature = "avro")] @@ -131,6 +132,7 @@ pub enum DataFusionError { /// Errors from either mapping LogicalPlans to/from Substrait plans /// or serializing/deserializing protobytes to Substrait plans Substrait(String), + Diagnostic(Diagnostic, Box), } #[macro_export] @@ -316,6 +318,7 @@ impl Error for DataFusionError { DataFusionError::External(e) => Some(e.as_ref()), DataFusionError::Context(_, e) => Some(e.as_ref()), DataFusionError::Substrait(_) => None, + DataFusionError::Diagnostic(_, e) => Some(e.as_ref()), } } } @@ -372,6 +375,11 @@ impl DataFusionError { Self::Context(description.into(), Box::new(self)) } + /// wraps self in Self::Diagnostic with a [`Diagnostic`] + pub fn with_diagnostic(self, diag: Diagnostic) -> Self { + Self::Diagnostic(diag, Box::new(self)) + } + /// Strips backtrace out of the error message /// If backtrace enabled then error has a format "message" [`Self::BACK_TRACE_SEP`] "backtrace" /// The method strips the backtrace and outputs "message" @@ -429,6 +437,7 @@ impl DataFusionError { DataFusionError::External(_) => "External error: ", DataFusionError::Context(_, _) => "", DataFusionError::Substrait(_) => "Substrait error: ", + DataFusionError::Diagnostic(_, _) => "", } } @@ -469,7 +478,40 @@ impl DataFusionError { Cow::Owned(format!("{desc}\ncaused by\n{}", *err)) } DataFusionError::Substrait(ref desc) => Cow::Owned(desc.to_string()), + DataFusionError::Diagnostic(_, ref err) => Cow::Owned(err.to_string()), + } + } + + pub fn get_diagnostics(&self) -> impl Iterator + '_ { + struct DiagnosticsIterator<'a> { + head: &'a DataFusionError, } + + impl<'a> Iterator for DiagnosticsIterator<'a> { + type Item = &'a Diagnostic; + + fn next(&mut self) -> Option { + loop { + if let DataFusionError::Diagnostic(diagnostics, source) = self.head { + self.head = source.as_ref(); + return Some(diagnostics); + } + + if let Some(source) = self + .head + .source() + .map(|source| source.downcast_ref::()) + .flatten() + { + self.head = source; + } else { + return None; + } + } + } + } + + DiagnosticsIterator { head: self } } } diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index 77e8cd60ede2..8eed7e18005a 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -47,6 +47,8 @@ pub mod test_util; pub mod tree_node; pub mod types; pub mod utils; +pub mod diagnostic; +pub mod with_span; /// Reexport arrow crate pub use arrow; @@ -76,6 +78,8 @@ pub use stats::{ColumnStatistics, Statistics}; pub use table_reference::{ResolvedTableReference, TableReference}; pub use unnest::{RecursionUnnestOption, UnnestOptions}; pub use utils::project_schema; +pub use diagnostic::{Diagnostic, DiagnosticEntry, DiagnosticEntryKind}; +pub use with_span::WithSpan; // These are hidden from docs purely to avoid polluting the public view of what this crate exports. // These are just re-exports of macros by the same name, which gets around the 'cannot refer to diff --git a/datafusion/common/src/with_span.rs b/datafusion/common/src/with_span.rs new file mode 100644 index 000000000000..d2acddaa0d2b --- /dev/null +++ b/datafusion/common/src/with_span.rs @@ -0,0 +1,96 @@ +use std::{cmp::Ordering, fmt::{self, Debug, Display}, ops::{Deref, DerefMut}}; + +use sqlparser::tokenizer::Span; + +/// A wrapper type for entitites that somehow refer to a location in the source +/// code. For example, a [`DataType`](arrow_schema::DataType) can be tied to an +/// expression which is being planned for execution, and the expression is in +/// turn located somewhere in the source code. +pub struct WithSpan { + pub span: Span, + pub inner: T, +} + +impl WithSpan { + pub fn into_inner(self) -> T { + self.inner + } + + pub fn new(inner: T, span: Span) -> Self { + Self { span, inner } + } + + pub fn new_without_span(inner: T) -> Self { + Self { + span: Span::empty(), + inner, + } + } +} + +impl From for WithSpan { + fn from(inner: T) -> Self { + Self::new_without_span(inner) + } +} + +impl Deref for WithSpan { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.inner + } +} + +impl DerefMut for WithSpan { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.inner + } +} + +impl AsRef for WithSpan { + fn as_ref(&self) -> &T { + &self.inner + } +} + +impl PartialEq for WithSpan { + fn eq(&self, other: &Self) -> bool { + self.inner.eq(&other.inner) + } +} + +impl Eq for WithSpan {} + +impl PartialOrd for WithSpan { + fn partial_cmp(&self, other: &Self) -> Option { + self.inner.partial_cmp(&other.inner) + } +} + +impl Ord for WithSpan { + fn cmp(&self, other: &Self) -> Ordering { + self.inner.cmp(&other.inner) + } +} + +impl Debug for WithSpan { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.inner.fmt(f) + } +} + +impl Display for WithSpan { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.inner.fmt(f) + } +} + +impl Clone for WithSpan { + fn clone(&self) -> Self { + Self { + span: self.span, + inner: self.inner.clone(), + } + } +} diff --git a/datafusion/core/tests/execution/logical_plan.rs b/datafusion/core/tests/execution/logical_plan.rs index 168bf484e541..b405783d17a9 100644 --- a/datafusion/core/tests/execution/logical_plan.rs +++ b/datafusion/core/tests/execution/logical_plan.rs @@ -25,6 +25,7 @@ use datafusion_expr::logical_plan::{LogicalPlan, Values}; use datafusion_expr::{Aggregate, AggregateUDF, Expr}; use datafusion_functions_aggregate::count::Count; use datafusion_physical_plan::collect; +use sqlparser::tokenizer::Span; use std::collections::HashMap; use std::fmt::Debug; use std::ops::Deref; @@ -51,6 +52,7 @@ async fn count_only_nulls() -> Result<()> { let input_col_ref = Expr::Column(Column { relation: None, name: "col".to_string(), + span: Span::empty(), }); // Aggregation: count(col) AS count diff --git a/datafusion/expr-common/Cargo.toml b/datafusion/expr-common/Cargo.toml index 109d8e0b89a6..7f2183dc2892 100644 --- a/datafusion/expr-common/Cargo.toml +++ b/datafusion/expr-common/Cargo.toml @@ -41,3 +41,4 @@ arrow = { workspace = true } datafusion-common = { workspace = true } itertools = { workspace = true } paste = "^1.0" +sqlparser = { workspace = true } diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index 4b2f3b5e46b5..9e392c5dbd13 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -30,9 +30,11 @@ use arrow::datatypes::{ }; use datafusion_common::types::NativeType; use datafusion_common::{ - exec_datafusion_err, exec_err, internal_err, plan_datafusion_err, plan_err, Result, + exec_datafusion_err, exec_err, internal_err, plan_datafusion_err, plan_err, + Diagnostic, DiagnosticEntry, DiagnosticEntryKind, Result, WithSpan, }; use itertools::Itertools; +use sqlparser::tokenizer::Span; /// The type signature of an instantiation of binary operator expression such as /// `lhs + rhs` @@ -69,10 +71,19 @@ impl Signature { } /// Returns a [`Signature`] for applying `op` to arguments of type `lhs` and `rhs` -fn signature(lhs: &DataType, op: &Operator, rhs: &DataType) -> Result { +fn signature<'a>( + lhs: impl Into>, + op: impl Into>, + rhs: impl Into>, +) -> Result { use arrow::datatypes::DataType::*; use Operator::*; - match op { + + let lhs: WithSpan<_> = lhs.into(); + let op: WithSpan<_> = op.into(); + let rhs: WithSpan<_> = rhs.into(); + + match *op { Eq | NotEq | Lt | @@ -81,13 +92,13 @@ fn signature(lhs: &DataType, op: &Operator, rhs: &DataType) -> Result GtEq | IsDistinctFrom | IsNotDistinctFrom => { - comparison_coercion(lhs, rhs).map(Signature::comparison).ok_or_else(|| { + comparison_coercion(&lhs, &rhs).map(Signature::comparison).ok_or_else(|| { plan_datafusion_err!( "Cannot infer common argument type for comparison operation {lhs} {op} {rhs}" ) }) } - And | Or => if matches!((lhs, rhs), (Boolean | Null, Boolean | Null)) { + And | Or => if matches!((*lhs, *rhs), (Boolean | Null, Boolean | Null)) { // Logical binary boolean operators can only be evaluated for // boolean or null arguments. Ok(Signature::uniform(Boolean)) @@ -97,28 +108,28 @@ fn signature(lhs: &DataType, op: &Operator, rhs: &DataType) -> Result ) } RegexMatch | RegexIMatch | RegexNotMatch | RegexNotIMatch => { - regex_coercion(lhs, rhs).map(Signature::comparison).ok_or_else(|| { + regex_coercion(lhs.as_ref(), rhs.as_ref()).map(Signature::comparison).ok_or_else(|| { plan_datafusion_err!( "Cannot infer common argument type for regex operation {lhs} {op} {rhs}" ) }) } LikeMatch | ILikeMatch | NotLikeMatch | NotILikeMatch => { - regex_coercion(lhs, rhs).map(Signature::comparison).ok_or_else(|| { + regex_coercion(lhs.as_ref(), rhs.as_ref()).map(Signature::comparison).ok_or_else(|| { plan_datafusion_err!( "Cannot infer common argument type for regex operation {lhs} {op} {rhs}" ) }) } BitwiseAnd | BitwiseOr | BitwiseXor | BitwiseShiftRight | BitwiseShiftLeft => { - bitwise_coercion(lhs, rhs).map(Signature::uniform).ok_or_else(|| { + bitwise_coercion(lhs.as_ref(), rhs.as_ref()).map(Signature::uniform).ok_or_else(|| { plan_datafusion_err!( "Cannot infer common type for bitwise operation {lhs} {op} {rhs}" ) }) } StringConcat => { - string_concat_coercion(lhs, rhs).map(Signature::uniform).ok_or_else(|| { + string_concat_coercion(lhs.as_ref(), rhs.as_ref()).map(Signature::uniform).ok_or_else(|| { plan_datafusion_err!( "Cannot infer common string type for string concat operation {lhs} {op} {rhs}" ) @@ -128,7 +139,7 @@ fn signature(lhs: &DataType, op: &Operator, rhs: &DataType) -> Result // ArrowAt and AtArrow check for whether one array is contained in another. // The result type is boolean. Signature::comparison defines this signature. // Operation has nothing to do with comparison - array_coercion(lhs, rhs).map(Signature::comparison).ok_or_else(|| { + array_coercion(lhs.as_ref(), rhs.as_ref()).map(Signature::comparison).ok_or_else(|| { plan_datafusion_err!( "Cannot infer common array type for arrow operation {lhs} {op} {rhs}" ) @@ -140,7 +151,7 @@ fn signature(lhs: &DataType, op: &Operator, rhs: &DataType) -> Result let l = new_empty_array(lhs); let r = new_empty_array(rhs); - let result = match op { + let result = match *op { Plus => add_wrapping(&l, &r), Minus => sub_wrapping(&l, &r), Multiply => mul_wrapping(&l, &r), @@ -151,14 +162,14 @@ fn signature(lhs: &DataType, op: &Operator, rhs: &DataType) -> Result result.map(|x| x.data_type().clone()) }; - if let Ok(ret) = get_result(lhs, rhs) { + if let Ok(ret) = get_result(lhs.as_ref(), rhs.as_ref()) { // Temporal arithmetic, e.g. Date32 + Interval Ok(Signature{ - lhs: lhs.clone(), - rhs: rhs.clone(), + lhs: lhs.into_inner().clone(), + rhs: rhs.into_inner().clone(), ret, }) - } else if let Some(coerced) = temporal_coercion_strict_timezone(lhs, rhs) { + } else if let Some(coerced) = temporal_coercion_strict_timezone(lhs.as_ref(), rhs.as_ref()) { // Temporal arithmetic by first coercing to a common time representation // e.g. Date32 - Timestamp let ret = get_result(&coerced, &coerced).map_err(|e| { @@ -171,7 +182,7 @@ fn signature(lhs: &DataType, op: &Operator, rhs: &DataType) -> Result rhs: coerced, ret, }) - } else if let Some((lhs, rhs)) = math_decimal_coercion(lhs, rhs) { + } else if let Some((lhs, rhs)) = math_decimal_coercion(lhs.as_ref(), rhs.as_ref()) { // Decimal arithmetic, e.g. Decimal(10, 2) + Decimal(10, 0) let ret = get_result(&lhs, &rhs).map_err(|e| { plan_datafusion_err!( @@ -183,32 +194,56 @@ fn signature(lhs: &DataType, op: &Operator, rhs: &DataType) -> Result rhs, ret, }) - } else if let Some(numeric) = mathematics_numerical_coercion(lhs, rhs) { + } else if let Some(numeric) = mathematics_numerical_coercion(lhs.as_ref(), rhs.as_ref()) { // Numeric arithmetic, e.g. Int32 + Int32 Ok(Signature::uniform(numeric)) } else { plan_err!( "Cannot coerce arithmetic expression {lhs} {op} {rhs} to valid types" - ) + ).map_err(|err | { + if lhs.span == Span::empty() || rhs.span == Span::empty() { + err + } else { + err.with_diagnostic(Diagnostic { + entries: vec![ + DiagnosticEntry { + kind: DiagnosticEntryKind::Error, + message: "Binary expression has invalid types".to_string(), + span: lhs.span.union(&rhs.span), + }, + DiagnosticEntry { + kind: DiagnosticEntryKind::Note, + message: format!("Is of type {}", *lhs), + span: lhs.span, + }, + DiagnosticEntry { + kind: DiagnosticEntryKind::Note, + message: format!("Is of type {}", *rhs), + span: rhs.span, + } + ] + }) + } + }) } } } } /// Returns the resulting type of a binary expression evaluating the `op` with the left and right hand types -pub fn get_result_type( - lhs: &DataType, - op: &Operator, - rhs: &DataType, +pub fn get_result_type<'a>( + lhs: impl Into>, + op: impl Into>, + rhs: impl Into>, ) -> Result { signature(lhs, op, rhs).map(|sig| sig.ret) } /// Returns the coerced input types for a binary expression evaluating the `op` with the left and right hand types -pub fn get_input_types( - lhs: &DataType, - op: &Operator, - rhs: &DataType, +pub fn get_input_types<'a>( + lhs: impl Into>, + op: impl Into>, + rhs: impl Into>, ) -> Result<(DataType, DataType)> { signature(lhs, op, rhs).map(|sig| (sig.lhs, sig.rhs)) } diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index d0e98327f34b..c5eaa4bf88e4 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -42,6 +42,7 @@ use sqlparser::ast::{ display_comma_separated, ExceptSelectItem, ExcludeSelectItem, IlikeSelectItem, NullTreatment, RenameSelectItem, ReplaceSelectElement, }; +use sqlparser::tokenizer::Span; /// Represents logical expressions such as `A + 1`, or `CAST(c1 AS int)`. /// @@ -1113,7 +1114,7 @@ impl Expr { /// output schema. We can use this qualified name to reference the field. pub fn qualified_name(&self) -> (Option, String) { match self { - Expr::Column(Column { relation, name }) => (relation.clone(), name.clone()), + Expr::Column(Column { relation, name, span: _ }) => (relation.clone(), name.clone()), Expr::Alias(Alias { relation, name, .. }) => (relation.clone(), name.clone()), _ => (None, self.schema_name().to_string()), } @@ -1665,6 +1666,13 @@ impl Expr { | Expr::Placeholder(..) => false, } } + + pub fn get_span(&self) -> Option { + match self { + Expr::Column(Column{span, ..}) => Some(*span), + _ => None, + } + } } impl HashNode for Expr { diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index b944428977c4..c32f217c933b 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -160,6 +160,7 @@ pub fn unnormalize_col(expr: Expr) -> Expr { let col = Column { relation: None, name: c.name, + span: c.span, }; Transformed::yes(Expr::Column(col)) } else { @@ -181,10 +182,10 @@ pub fn create_col_from_scalar_expr( Some::(subqry_alias.into()), name, )), - Expr::Column(Column { relation: _, name }) => Ok(Column::new( + Expr::Column(Column { relation: _, name, span }) => Ok(Column::new( Some::(subqry_alias.into()), name, - )), + ).with_span(*span)), _ => { let scalar_column = scalar_expr.schema_name().to_string(); Ok(Column::new( diff --git a/datafusion/expr/src/expr_rewriter/order_by.rs b/datafusion/expr/src/expr_rewriter/order_by.rs index f0d3d8fcd0c1..c49197f78eb9 100644 --- a/datafusion/expr/src/expr_rewriter/order_by.rs +++ b/datafusion/expr/src/expr_rewriter/order_by.rs @@ -23,6 +23,7 @@ use crate::{expr::Sort, Cast, Expr, LogicalPlan, TryCast}; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::{Column, Result}; +use sqlparser::tokenizer::Span; /// Rewrite sort on aggregate expressions to sort on the column of aggregate output /// For example, `max(x)` is written to `col("max(x)")` @@ -101,6 +102,8 @@ fn rewrite_in_terms_of_projection( let search_col = Expr::Column(Column { relation: None, name, + // Span is not used in PartialEq + span: Span::empty(), }); // look for the column named the same as this expr diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index b1a461eca41d..88dd74ded7d5 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -29,7 +29,7 @@ use arrow::compute::can_cast_types; use arrow::datatypes::{DataType, Field}; use datafusion_common::{ not_impl_err, plan_datafusion_err, plan_err, Column, ExprSchema, Result, - TableReference, + TableReference, WithSpan, }; use datafusion_functions_window_common::field::WindowUDFFieldArgs; use recursive::recursive; @@ -403,9 +403,26 @@ impl ExprSchemable for Expr { ref right, ref op, }) => { - let left = left.data_type_and_nullable(schema)?; - let right = right.data_type_and_nullable(schema)?; - Ok((get_result_type(&left.0, op, &right.0)?, left.1 || right.1)) + let (left_type, left_is_nullable) = + left.data_type_and_nullable(schema)?; + let left_type = if let Some(span) = left.get_span() { + WithSpan::new(&left_type, span) + } else { + (&left_type).into() + }; + + let (right_type, right_is_nullable) = + right.data_type_and_nullable(schema)?; + let right_type = if let Some(span) = right.get_span() { + WithSpan::new(&right_type, span) + } else { + (&right_type).into() + }; + + Ok(( + get_result_type(left_type, op, right_type)?, + left_is_nullable || right_is_nullable, + )) } Expr::WindowFunction(window_function) => { self.data_type_and_nullable_with_window_function(schema, window_function) diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 90235e3f84c4..4a8c5d08e284 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1967,6 +1967,7 @@ mod tests { use crate::{col, expr, expr_fn::exists, in_subquery, lit, scalar_subquery}; use datafusion_common::{RecursionUnnestOption, SchemaError}; + use sqlparser::tokenizer::Span; #[test] fn plan_builder_simple() -> Result<()> { @@ -2211,6 +2212,7 @@ mod tests { Column { relation: Some(TableReference::Bare { table }), name, + span: _, }, }, _, diff --git a/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs b/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs index 9fbe54e1ccb9..1853f0aaf3ec 100644 --- a/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs +++ b/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs @@ -123,21 +123,29 @@ fn expand_exprlist(input: &LogicalPlan, expr: Vec) -> Result> { Expr::Column(Column { ref relation, ref name, + span, }) => { if name.eq("*") { - if let Some(qualifier) = relation { - projected_expr.extend(expand_qualified_wildcard( + let expanded_columns = if let Some(qualifier) = relation { + expand_qualified_wildcard( qualifier, input.schema(), None, - )?); + )? } else { - projected_expr.extend(expand_wildcard( + expand_wildcard( input.schema(), input, None, - )?); - } + )? + }; + + projected_expr.extend(expanded_columns.into_iter().map(|mut expr| { + if let Expr::Column(c) = &mut expr { + c.span = span; + } + expr + })); } else { projected_expr.push(e.clone()); } diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 58e390ee8bdb..1bd4009b47cc 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -1000,6 +1000,7 @@ fn project_with_column_index( Expr::Column(Column { relation: _, ref name, + span: _, }) if name != schema.field(i).name() => e.alias(schema.field(i).name()), Expr::Alias { .. } | Expr::Column { .. } => e, _ => e.alias(schema.field(i).name()), diff --git a/datafusion/optimizer/src/decorrelate.rs b/datafusion/optimizer/src/decorrelate.rs index b5726d999137..5ebd73799dd5 100644 --- a/datafusion/optimizer/src/decorrelate.rs +++ b/datafusion/optimizer/src/decorrelate.rs @@ -532,7 +532,7 @@ fn proj_exprs_evaluation_result_on_empty_batch( let result_expr = simplifier.simplify(result_expr)?; let expr_name = match expr { Expr::Alias(Alias { name, .. }) => name.to_string(), - Expr::Column(Column { relation: _, name }) => name.to_string(), + Expr::Column(Column { relation: _, name, span: _ }) => name.to_string(), _ => expr.schema_name().to_string(), }; expr_result_map_for_count_bug.insert(expr_name, result_expr); diff --git a/datafusion/sql/src/expr/identifier.rs b/datafusion/sql/src/expr/identifier.rs index 9adf14459081..d889d5e20152 100644 --- a/datafusion/sql/src/expr/identifier.rs +++ b/datafusion/sql/src/expr/identifier.rs @@ -46,6 +46,8 @@ impl SqlToRel<'_, S> { })?; Ok(Expr::ScalarVariable(ty, var_names)) } else { + let id_span = id.span; + // Don't use `col()` here because it will try to // interpret names with '.' as if they were // compound identifiers, but this is not a compound @@ -59,6 +61,7 @@ impl SqlToRel<'_, S> { return Ok(Expr::Column(Column { relation: qualifier.filter(|q| q.table() != UNNAMED_TABLE).cloned(), name: normalize_ident, + span: id_span, })); } @@ -79,6 +82,7 @@ impl SqlToRel<'_, S> { Ok(Expr::Column(Column { relation: None, name: normalize_ident, + span: id_span, })) } } diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs index fb83ecea88b8..68282782575c 100644 --- a/datafusion/sql/src/parser.rs +++ b/datafusion/sql/src/parser.rs @@ -283,10 +283,10 @@ impl<'a> DFParser<'a> { dialect: &'a dyn Dialect, ) -> Result { let mut tokenizer = Tokenizer::new(dialect, sql); - let tokens = tokenizer.tokenize()?; + let tokens = tokenizer.tokenize_with_location()?; Ok(DFParser { - parser: Parser::new(dialect).with_tokens(tokens), + parser: Parser::new(dialect).with_tokens_with_locations(tokens), }) } diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs index 88b2e79558ca..3ddec1a87082 100644 --- a/datafusion/sql/src/unparser/expr.rs +++ b/datafusion/sql/src/unparser/expr.rs @@ -1695,6 +1695,7 @@ mod tests { Expr::Column(Column { relation: Some(TableReference::partial("a", "b")), name: "c".to_string(), + span: Span::empty(), }) .gt(lit(4)), r#"(b.c > 4)"#, @@ -2032,6 +2033,7 @@ mod tests { expr: Box::new(Expr::Column(Column { relation: Some(TableReference::partial("schema", "table")), name: "array_col".to_string(), + span: Span::empty(), })), }), r#"UNNEST("table".array_col)"#, diff --git a/datafusion/sql/src/unparser/plan.rs b/datafusion/sql/src/unparser/plan.rs index 1a1d4c42b0ff..5f015c6465fc 100644 --- a/datafusion/sql/src/unparser/plan.rs +++ b/datafusion/sql/src/unparser/plan.rs @@ -911,10 +911,12 @@ impl Unparser<'_> { Expr::Column(Column { relation: _, name: left_name, + span: _, }), Expr::Column(Column { relation: _, name: right_name, + span: _, }), ) if left_name == right_name => { idents.push(self.new_ident_quoted_if_needs(left_name.to_string())); From dee7959dbaa6d233092e0d80c40ebedb40317a06 Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Fri, 6 Dec 2024 14:36:25 +0100 Subject: [PATCH 4/8] feat: diagnostics for unknown columns and missing in `GROUP BY` --- datafusion/common/src/diagnostic.rs | 39 ++++++++++++- datafusion/common/src/error.rs | 10 +++- .../expr-common/src/type_coercion/binary.rs | 2 +- datafusion/sql/src/expr/identifier.rs | 20 +++++-- datafusion/sql/src/planner.rs | 25 ++++++++- datafusion/sql/src/select.rs | 21 +++++-- datafusion/sql/src/utils.rs | 55 +++++++++++++++---- 7 files changed, 145 insertions(+), 27 deletions(-) diff --git a/datafusion/common/src/diagnostic.rs b/datafusion/common/src/diagnostic.rs index 1ebf3f9ed566..130cbee0d9a1 100644 --- a/datafusion/common/src/diagnostic.rs +++ b/datafusion/common/src/diagnostic.rs @@ -12,10 +12,47 @@ pub struct DiagnosticEntry { pub kind: DiagnosticEntryKind, } -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum DiagnosticEntryKind { Error, Warning, Note, Help, } + +impl Diagnostic { + pub fn new(entries: impl IntoIterator) -> Self { + Diagnostic { + entries: entries.into_iter().collect(), + } + } +} + +impl FromIterator for Diagnostic { + fn from_iter>(iter: T) -> Self { + Diagnostic { + entries: iter.into_iter().collect(), + } + } +} + +impl DiagnosticEntry { + pub fn new( + message: impl Into, + kind: DiagnosticEntryKind, + span: Span, + ) -> Self { + DiagnosticEntry { + span, + message: message.into(), + kind, + } + } + + pub fn new_without_span( + message: impl Into, + kind: DiagnosticEntryKind, + ) -> Self { + Self::new(message, kind, Span::empty()) + } +} diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index 39916e377aae..f1a816ceac29 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -28,7 +28,7 @@ use std::sync::Arc; use crate::diagnostic::Diagnostic; use crate::utils::quote_identifier; -use crate::{Column, DFSchema, TableReference}; +use crate::{Column, DFSchema, DiagnosticEntry, DiagnosticEntryKind, TableReference}; #[cfg(feature = "avro")] use apache_avro::Error as AvroError; use arrow::error::ArrowError; @@ -376,8 +376,12 @@ impl DataFusionError { } /// wraps self in Self::Diagnostic with a [`Diagnostic`] - pub fn with_diagnostic(self, diag: Diagnostic) -> Self { - Self::Diagnostic(diag, Box::new(self)) + pub fn with_diagnostic Diagnostic>( + self, + f: F, + ) -> Self { + let diagnostic = f(&self); + Self::Diagnostic(diagnostic, Box::new(self)) } /// Strips backtrace out of the error message diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index 9e392c5dbd13..954807eae226 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -204,7 +204,7 @@ fn signature<'a>( if lhs.span == Span::empty() || rhs.span == Span::empty() { err } else { - err.with_diagnostic(Diagnostic { + err.with_diagnostic(|_| Diagnostic { entries: vec![ DiagnosticEntry { kind: DiagnosticEntryKind::Error, diff --git a/datafusion/sql/src/expr/identifier.rs b/datafusion/sql/src/expr/identifier.rs index d889d5e20152..14c7bb8f51c1 100644 --- a/datafusion/sql/src/expr/identifier.rs +++ b/datafusion/sql/src/expr/identifier.rs @@ -20,10 +20,12 @@ use sqlparser::ast::{Expr as SQLExpr, Ident}; use datafusion_common::{ internal_err, not_impl_err, plan_datafusion_err, plan_err, Column, DFSchema, - DataFusionError, Result, TableReference, + DataFusionError, Diagnostic, DiagnosticEntry, DiagnosticEntryKind, Result, + TableReference, }; use datafusion_expr::planner::PlannerResult; use datafusion_expr::{Case, Expr}; +use sqlparser::tokenizer::Span; use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use datafusion_expr::UNNAMED_TABLE; @@ -112,6 +114,7 @@ impl SqlToRel<'_, S> { })?; Ok(Expr::ScalarVariable(ty, var_names)) } else { + let span = ids.last().map(|id| id.span).unwrap_or(Span::empty()); let ids = ids .into_iter() .map(|id| self.ident_normalizer.normalize(id)) @@ -139,9 +142,9 @@ impl SqlToRel<'_, S> { plan_err!("could not parse compound identifier from {ids:?}") } // Found matching field with no spare identifier(s) - Some((field, qualifier, _nested_names)) => { - Ok(Expr::Column(Column::from((qualifier, field)))) - } + Some((field, qualifier, _nested_names)) => Ok(Expr::Column( + Column::from((qualifier, field)).with_span(span), + )), None => { // Return default where use all identifiers to not have a nested field // this len check is because at 5 identifiers will have to have a nested field @@ -176,14 +179,19 @@ impl SqlToRel<'_, S> { // safe unwrap as s can never be empty or exceed the bounds let (relation, column_name) = form_identifier(s).unwrap(); - Ok(Expr::Column(Column::new(relation, column_name))) + Ok(Expr::Column( + Column::new(relation, column_name) + .with_span(span), + )) } } } else { let s = &ids[0..ids.len()]; // Safe unwrap as s can never be empty or exceed the bounds let (relation, column_name) = form_identifier(s).unwrap(); - Ok(Expr::Column(Column::new(relation, column_name))) + Ok(Expr::Column( + Column::new(relation, column_name).with_span(span), + )) } } } diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 6ef2eb4e7fb2..0b1481544d05 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -22,7 +22,8 @@ use std::vec; use arrow_schema::*; use datafusion_common::{ - field_not_found, internal_err, plan_datafusion_err, DFSchemaRef, SchemaError, + field_not_found, internal_err, plan_datafusion_err, DFSchemaRef, Diagnostic, + DiagnosticEntry, DiagnosticEntryKind, SchemaError, }; use sqlparser::ast::{ArrayElemTypeDef, ExactNumberInfo}; use sqlparser::ast::{ColumnDef as SQLColumnDef, ColumnOption}; @@ -383,7 +384,20 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .try_for_each(|col| match col { Expr::Column(col) => match &col.relation { Some(r) => { - schema.field_with_qualified_name(r, &col.name)?; + schema.field_with_qualified_name(r, &col.name).map_err( + |err| { + err.with_diagnostic(|_| { + Diagnostic::new([DiagnosticEntry::new( + format!( + "No field named '{}' in relation '{}'", + col.name, r + ), + DiagnosticEntryKind::Error, + col.span, + )]) + }) + }, + )?; Ok(()) } None => { @@ -396,6 +410,13 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } .map_err(|_: DataFusionError| { field_not_found(col.relation.clone(), col.name.as_str(), schema) + .with_diagnostic(|_| { + Diagnostic::new([DiagnosticEntry::new( + format!("No field named '{}'", col.name), + DiagnosticEntryKind::Error, + col.span, + )]) + }) }), _ => internal_err!("Not a column"), }) diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index 9899d92bde71..6d1b3f185ae1 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -22,10 +22,13 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use crate::utils::{ check_columns_satisfy_exprs, extract_aliases, rebase_expr, resolve_aliases_to_exprs, resolve_columns, resolve_positions_to_exprs, rewrite_recursive_unnests_bottom_up, + CheckColumnsSatisfyExprsPurpose, }; use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; -use datafusion_common::{not_impl_err, plan_err, Result}; +use datafusion_common::{ + not_impl_err, plan_err, Diagnostic, DiagnosticEntry, DiagnosticEntryKind, Result, +}; use datafusion_common::{RecursionUnnestOption, UnnestOptions}; use datafusion_expr::expr::{Alias, PlannedReplaceSelectItem, WildcardOptions}; use datafusion_expr::expr_rewriter::{ @@ -44,6 +47,7 @@ use sqlparser::ast::{ WildcardAdditionalOptions, WindowType, }; use sqlparser::ast::{NamedWindowDefinition, Select, SelectItem, TableWithJoins}; +use sqlparser::tokenizer::Span; impl SqlToRel<'_, S> { /// Generate a logic plan from an SQL select @@ -803,8 +807,17 @@ impl SqlToRel<'_, S> { check_columns_satisfy_exprs( &column_exprs_post_aggr, &select_exprs_post_aggr, - "Projection references non-aggregate values", - )?; + CheckColumnsSatisfyExprsPurpose::GroupBy, + ) + .map_err(|err| { + err.with_diagnostic(|_| { + Diagnostic::new([DiagnosticEntry::new( + "GROUP BY clause is here", + DiagnosticEntryKind::Note, + Span::union_iter(group_by_exprs.iter().filter_map(|e| e.get_span())), + )]) + }) + })?; // Rewrite the HAVING expression to use the columns produced by the // aggregation. @@ -815,7 +828,7 @@ impl SqlToRel<'_, S> { check_columns_satisfy_exprs( &column_exprs_post_aggr, std::slice::from_ref(&having_expr_post_aggr), - "HAVING clause references non-aggregate values", + CheckColumnsSatisfyExprsPurpose::Having, )?; Some(having_expr_post_aggr) diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index e29a2299b41c..b2133d463f87 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -26,8 +26,8 @@ use datafusion_common::tree_node::{ Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeRewriter, }; use datafusion_common::{ - exec_err, internal_err, plan_err, Column, DFSchemaRef, DataFusionError, HashMap, - Result, ScalarValue, + exec_err, internal_err, plan_err, Column, DFSchemaRef, DataFusionError, Diagnostic, + DiagnosticEntry, DiagnosticEntryKind, HashMap, Result, ScalarValue, }; use datafusion_expr::builder::get_struct_unnested_columns; use datafusion_expr::expr::{Alias, GroupingSet, Unnest, WindowFunction}; @@ -37,6 +37,7 @@ use datafusion_expr::{ }; use indexmap::IndexMap; use sqlparser::ast::{Ident, Value}; +use sqlparser::tokenizer::Span; /// Make a best-effort attempt at resolving all columns in the expression tree pub(crate) fn resolve_columns(expr: &Expr, plan: &LogicalPlan) -> Result { @@ -89,12 +90,29 @@ pub(crate) fn rebase_expr( .data() } +/// Why [`check_columns_satisfy_exprs`] is called. Helps build better error +/// messages. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum CheckColumnsSatisfyExprsPurpose { + GroupBy, + Having, +} + +impl CheckColumnsSatisfyExprsPurpose { + fn get_error_message(&self) -> &'static str { + match self { + CheckColumnsSatisfyExprsPurpose::GroupBy => "Projection references non-aggregate values", + CheckColumnsSatisfyExprsPurpose::Having => "HAVING clause references non-aggregate values", + } + } +} + /// Determines if the set of `Expr`'s are a valid projection on the input /// `Expr::Column`'s. pub(crate) fn check_columns_satisfy_exprs( columns: &[Expr], exprs: &[Expr], - message_prefix: &str, + call_purpose: CheckColumnsSatisfyExprsPurpose, ) -> Result<()> { columns.iter().try_for_each(|c| match c { Expr::Column(_) => Ok(()), @@ -105,22 +123,22 @@ pub(crate) fn check_columns_satisfy_exprs( match e { Expr::GroupingSet(GroupingSet::Rollup(exprs)) => { for e in exprs { - check_column_satisfies_expr(columns, e, message_prefix)?; + check_column_satisfies_expr(columns, e, call_purpose)?; } } Expr::GroupingSet(GroupingSet::Cube(exprs)) => { for e in exprs { - check_column_satisfies_expr(columns, e, message_prefix)?; + check_column_satisfies_expr(columns, e, call_purpose)?; } } Expr::GroupingSet(GroupingSet::GroupingSets(lists_of_exprs)) => { for exprs in lists_of_exprs { for e in exprs { - check_column_satisfies_expr(columns, e, message_prefix)?; + check_column_satisfies_expr(columns, e, call_purpose)?; } } } - _ => check_column_satisfies_expr(columns, e, message_prefix)?, + _ => check_column_satisfies_expr(columns, e, call_purpose)?, } } Ok(()) @@ -129,15 +147,32 @@ pub(crate) fn check_columns_satisfy_exprs( fn check_column_satisfies_expr( columns: &[Expr], expr: &Expr, - message_prefix: &str, + call_purpose: CheckColumnsSatisfyExprsPurpose, ) -> Result<()> { if !columns.contains(expr) { return plan_err!( "{}: Expression {} could not be resolved from available columns: {}", - message_prefix, + call_purpose.get_error_message(), expr, expr_vec_fmt!(columns) - ); + ) + .map_err(|err| { + let message = match call_purpose { + CheckColumnsSatisfyExprsPurpose::GroupBy => format!("'{}' in projection does not appear in GROUP BY clause", expr.to_string()), + CheckColumnsSatisfyExprsPurpose::Having => format!("'{}' in HAVING clause does not appear in GROUP BY clause", expr.to_string()), + }; + err.with_diagnostic(|_| { + Diagnostic::new([DiagnosticEntry::new( + message, + DiagnosticEntryKind::Error, + expr.get_span().unwrap_or(Span::empty()), + ),DiagnosticEntry::new( + format!("Add '{}' to GROUP BY clause", expr.to_string()), + DiagnosticEntryKind::Help, + Span::empty(), + )]) + }) + }); } Ok(()) } From 515bb5e08453f0a69308f1f149d791f117d33db7 Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Mon, 9 Dec 2024 15:07:38 +0100 Subject: [PATCH 5/8] feat: `FieldSpans` and ambiguous column reference diagnostics --- datafusion/common/src/column.rs | 62 ++++++++-- .../common/src/dfschema/fields_spans.rs | 42 +++++++ .../src/{dfschema.rs => dfschema/mod.rs} | 112 +++++++++++++----- datafusion/common/src/lib.rs | 4 +- datafusion/common/src/with_span.rs | 69 +++++------ datafusion/core/src/dataframe/mod.rs | 4 +- datafusion/core/src/physical_planner.rs | 8 +- .../expr-common/src/type_coercion/binary.rs | 38 +++--- datafusion/expr/src/expr.rs | 21 +++- datafusion/expr/src/expr_rewriter/mod.rs | 18 ++- datafusion/expr/src/expr_rewriter/order_by.rs | 2 +- datafusion/expr/src/expr_schema.rs | 10 +- datafusion/expr/src/logical_plan/builder.rs | 22 ++-- datafusion/expr/src/logical_plan/plan.rs | 14 ++- datafusion/expr/src/utils.rs | 5 +- .../src/analyzer/expand_wildcard_rule.rs | 4 +- .../optimizer/src/analyzer/type_coercion.rs | 6 +- .../optimizer/src/common_subexpr_eliminate.rs | 7 +- datafusion/optimizer/src/decorrelate.rs | 2 +- datafusion/optimizer/src/push_down_filter.rs | 8 +- .../src/replace_distinct_aggregate.rs | 2 +- datafusion/sql/src/expr/identifier.rs | 4 +- datafusion/sql/src/expr/mod.rs | 4 +- datafusion/sql/src/planner.rs | 5 +- datafusion/sql/src/select.rs | 6 +- datafusion/sql/src/statement.rs | 2 +- datafusion/sql/src/unparser/plan.rs | 4 +- datafusion/sql/src/utils.rs | 41 +++++-- 28 files changed, 353 insertions(+), 173 deletions(-) create mode 100644 datafusion/common/src/dfschema/fields_spans.rs rename datafusion/common/src/{dfschema.rs => dfschema/mod.rs} (94%) diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index fe29a24147c3..0ddd1f0a4112 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -22,7 +22,10 @@ use sqlparser::tokenizer::Span; use crate::error::_schema_err; use crate::utils::{parse_identifiers_normalized, quote_identifier}; -use crate::{DFSchema, Result, SchemaError, TableReference}; +use crate::{ + DFSchema, Diagnostic, DiagnosticEntry, DiagnosticEntryKind, Result, SchemaError, + TableReference, +}; use derivative::Derivative; use std::collections::HashSet; use std::convert::Infallible; @@ -46,7 +49,7 @@ pub struct Column { /// The location in the source code where this column originally came from. /// Does not change as the [`Column`] undergoes normalization and other /// transformations. - pub span: Span, + pub spans: Vec, } impl Column { @@ -63,7 +66,7 @@ impl Column { Self { relation: relation.map(|r| r.into()), name: name.into(), - span: Span::empty(), + spans: vec![], } } @@ -72,7 +75,7 @@ impl Column { Self { relation: None, name: name.into(), - span: Span::empty(), + spans: vec![], } } @@ -83,7 +86,7 @@ impl Column { Self { relation: None, name: name.into(), - span: Span::empty(), + spans: vec![], } } @@ -118,7 +121,7 @@ impl Column { Some(Self { relation, name, - span: Span::empty(), + spans: vec![], }) } @@ -133,7 +136,7 @@ impl Column { Self { relation: None, name: flat_name, - span: Span::empty(), + spans: vec![], }, ) } @@ -145,7 +148,7 @@ impl Column { Self { relation: None, name: flat_name, - span: Span::empty(), + spans: vec![], }, ) } @@ -251,7 +254,8 @@ impl Column { .flat_map(|s| s.columns_with_unqualified_name(&self.name)) .collect::>(); for using_col in using_columns { - let all_matched = columns.iter().all(|c| using_col.contains(c)); + let all_matched = + columns.iter().all(|c| using_col.contains(c)); // All matched fields belong to the same using column set, in orther words // the same join clause. We simply pick the qualifier from the first match. if all_matched { @@ -261,7 +265,33 @@ impl Column { // If not due to USING columns then due to ambiguous column name return _schema_err!(SchemaError::AmbiguousReference { - field: Column::new_unqualified(self.name), + field: Column::new_unqualified(&self.name), + }) + .map_err(|err| { + err.with_diagnostic(|_| { + Diagnostic::new( + [DiagnosticEntry::new( + format!("Ambiguous reference to '{}'", &self.name), + DiagnosticEntryKind::Error, + *self.spans.first().unwrap_or(&Span::empty()), + )] + .into_iter() + .chain( + columns.iter().flat_map(|c| { + c.spans.iter().map(|span| { + DiagnosticEntry::new( + format!( + "Possible reference to '{}' here", + &c.name + ), + DiagnosticEntryKind::Note, + *span, + ) + }) + }), + ), + ) + }) }); } } @@ -280,7 +310,17 @@ impl Column { /// Attaches a [`Span`] to the [`Column`], i.e. its location in the source /// SQL query. pub fn with_span(mut self, span: Span) -> Self { - self.span = span; + self.spans = vec![span]; + self + } + + /// Attaches multiple [`Span`]s to the [`Column`], i.e. its locations in the + /// source SQL query. + pub fn with_spans(mut self, spans: IT) -> Self + where + IT: IntoIterator, + { + self.spans = spans.into_iter().collect(); self } } diff --git a/datafusion/common/src/dfschema/fields_spans.rs b/datafusion/common/src/dfschema/fields_spans.rs new file mode 100644 index 000000000000..bbfa9f2783a7 --- /dev/null +++ b/datafusion/common/src/dfschema/fields_spans.rs @@ -0,0 +1,42 @@ +use sqlparser::tokenizer::Span; + +use crate::JoinType; + +/// The location in the source code where the fields are defined (e.g. in the +/// body of a CTE), if any. +#[derive(Debug, Clone)] +pub struct FieldsSpans(Vec>); + +impl FieldsSpans { + pub fn empty(field_count: usize) -> Self { + Self((0..field_count).into_iter().map(|_| Vec::new()).collect()) + } + + pub fn iter(&self) -> impl Iterator> { + self.0.iter() + } + + pub fn join( + &self, + other: &FieldsSpans, + join_type: &JoinType, + left_cols_len: usize, + ) -> FieldsSpans { + match join_type { + JoinType::Inner | JoinType::Left | JoinType::Right | JoinType::Full => { + FieldsSpans(self.0.iter().chain(other.0.iter()).cloned().collect()) + } + JoinType::LeftSemi => todo!(), + JoinType::RightSemi => todo!(), + JoinType::LeftAnti => todo!(), + JoinType::RightAnti => todo!(), + JoinType::LeftMark => todo!(), + } + } +} + +impl FromIterator> for FieldsSpans { + fn from_iter>>(iter: T) -> Self { + FieldsSpans(iter.into_iter().collect()) + } +} diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema/mod.rs similarity index 94% rename from datafusion/common/src/dfschema.rs rename to datafusion/common/src/dfschema/mod.rs index b53b475edaa0..f326cded1ac1 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema/mod.rs @@ -17,7 +17,10 @@ //! DFSchema is an extended schema struct that DataFusion uses to provide support for //! fields with optional relation names. +mod fields_spans; +pub use fields_spans::FieldsSpans; +use std::backtrace::Backtrace; use std::collections::{BTreeSet, HashMap, HashSet}; use std::fmt::{Display, Formatter}; use std::hash::Hash; @@ -28,10 +31,10 @@ use crate::{ field_not_found, unqualified_field_not_found, Column, FunctionalDependencies, SchemaError, TableReference, }; - use arrow::compute::can_cast_types; use arrow::datatypes::{DataType, Field, FieldRef, Fields, Schema, SchemaRef}; use arrow_schema::SchemaBuilder; +use derivative::Derivative; use sqlparser::tokenizer::Span; /// A reference-counted reference to a [DFSchema]. @@ -105,7 +108,8 @@ pub type DFSchemaRef = Arc; /// let schema = Schema::from(df_schema); /// assert_eq!(schema.fields().len(), 1); /// ``` -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, Derivative)] +#[derivative(PartialEq, Eq)] pub struct DFSchema { /// Inner Arrow schema reference. inner: SchemaRef, @@ -114,6 +118,10 @@ pub struct DFSchema { field_qualifiers: Vec>, /// Stores functional dependencies in the schema. functional_dependencies: FunctionalDependencies, + /// The location in the source code where the fields are defined (e.g. in + /// the body of a CTE), if any. + #[derivative(PartialEq = "ignore")] + fields_spans: FieldsSpans, } impl DFSchema { @@ -123,6 +131,7 @@ impl DFSchema { inner: Arc::new(Schema::new([])), field_qualifiers: vec![], functional_dependencies: FunctionalDependencies::empty(), + fields_spans: FieldsSpans::empty(0), } } @@ -148,12 +157,15 @@ impl DFSchema { let (qualifiers, fields): (Vec>, Vec>) = qualified_fields.into_iter().unzip(); + let field_count = fields.len(); + let schema = Arc::new(Schema::new_with_metadata(fields, metadata)); let dfschema = Self { inner: schema, field_qualifiers: qualifiers, functional_dependencies: FunctionalDependencies::empty(), + fields_spans: FieldsSpans::empty(field_count), }; dfschema.check_names()?; Ok(dfschema) @@ -183,6 +195,7 @@ impl DFSchema { inner: schema, field_qualifiers: vec![None; field_count], functional_dependencies: FunctionalDependencies::empty(), + fields_spans: FieldsSpans::empty(field_count), }; dfschema.check_names()?; Ok(dfschema) @@ -201,6 +214,7 @@ impl DFSchema { inner: schema.clone().into(), field_qualifiers: vec![Some(qualifier); schema.fields.len()], functional_dependencies: FunctionalDependencies::empty(), + fields_spans: FieldsSpans::empty(schema.fields.len()), }; schema.check_names()?; Ok(schema) @@ -211,10 +225,12 @@ impl DFSchema { qualifiers: Vec>, schema: &SchemaRef, ) -> Result { + let field_count = schema.fields.len(); let dfschema = Self { inner: Arc::clone(schema), field_qualifiers: qualifiers, functional_dependencies: FunctionalDependencies::empty(), + fields_spans: FieldsSpans::empty(field_count), }; dfschema.check_names()?; Ok(dfschema) @@ -266,6 +282,11 @@ impl DFSchema { } } + pub fn with_fields_spans(mut self, fields_spans: FieldsSpans) -> Self { + self.fields_spans = fields_spans; + self + } + /// Create a new schema that contains the fields from this schema followed by the fields /// from the supplied schema. An error will be returned if there are duplicate field names. pub fn join(&self, schema: &DFSchema) -> Result { @@ -277,6 +298,7 @@ impl DFSchema { let mut new_metadata = self.inner.metadata.clone(); new_metadata.extend(schema.inner.metadata.clone()); let new_schema_with_metadata = new_schema.with_metadata(new_metadata); + let new_field_count = new_schema_with_metadata.fields.len(); let mut new_qualifiers = self.field_qualifiers.clone(); new_qualifiers.extend_from_slice(schema.field_qualifiers.as_slice()); @@ -285,6 +307,7 @@ impl DFSchema { inner: Arc::new(new_schema_with_metadata), field_qualifiers: new_qualifiers, functional_dependencies: FunctionalDependencies::empty(), + fields_spans: FieldsSpans::empty(new_field_count), }; new_self.check_names()?; Ok(new_self) @@ -297,8 +320,10 @@ impl DFSchema { return; } - let self_fields: HashSet<(Option<&TableReference>, &FieldRef)> = - self.iter().collect(); + let self_fields: HashSet<(Option<&TableReference>, &FieldRef)> = self + .iter() + .map(|(qualifier, field, _)| (qualifier, field)) + .collect(); let self_unqualified_names: HashSet<&str> = self .inner .fields @@ -308,7 +333,7 @@ impl DFSchema { let mut schema_builder = SchemaBuilder::from(self.inner.fields.clone()); let mut qualifiers = Vec::new(); - for (qualifier, field) in other_schema.iter() { + for (qualifier, field, _) in other_schema.iter() { // skip duplicate columns let duplicated_field = match qualifier { Some(q) => self_fields.contains(&(Some(q), field)), @@ -354,7 +379,7 @@ impl DFSchema { let mut matches = self .iter() .enumerate() - .filter(|(_, (q, f))| match (qualifier, q) { + .filter(|(_, (q, f, _))| match (qualifier, q) { // field to lookup is qualified. // current field is qualified and not shared between relations, compare both // qualifier and name. @@ -425,8 +450,8 @@ impl DFSchema { /// Find all fields having the given qualifier pub fn fields_with_qualified(&self, qualifier: &TableReference) -> Vec<&Field> { self.iter() - .filter(|(q, _)| q.map(|q| q.eq(qualifier)).unwrap_or(false)) - .map(|(_, f)| f.as_ref()) + .filter(|(q, _, _)| q.map(|q| q.eq(qualifier)).unwrap_or(false)) + .map(|(_, f, _)| f.as_ref()) .collect() } @@ -437,7 +462,7 @@ impl DFSchema { ) -> Vec { self.iter() .enumerate() - .filter_map(|(idx, (q, _))| q.and_then(|q| q.eq(qualifier).then_some(idx))) + .filter_map(|(idx, (q, _, _))| q.and_then(|q| q.eq(qualifier).then_some(idx))) .collect() } @@ -456,23 +481,26 @@ impl DFSchema { name: &str, ) -> Vec<(Option<&TableReference>, &Field)> { self.iter() - .filter(|(_, field)| field.name() == name) - .map(|(qualifier, field)| (qualifier, field.as_ref())) + .filter(|(_, field, _)| field.name() == name) + .map(|(qualifier, field, _)| (qualifier, field.as_ref())) .collect() } /// Find all fields that match the given name and convert to column pub fn columns_with_unqualified_name(&self, name: &str) -> Vec { self.iter() - .filter(|(_, field)| field.name() == name) - .map(|(qualifier, field)| Column::new(qualifier.cloned(), field.name())) + .filter(|(_, field, _)| field.name() == name) + .map(|(qualifier, field, spans)| { + Column::new(qualifier.cloned(), field.name()) + .with_spans(spans.iter().copied()) + }) .collect() } /// Return all `Column`s for the schema pub fn columns(&self) -> Vec { self.iter() - .map(|(qualifier, field)| { + .map(|(qualifier, field, _)| { Column::new(qualifier.cloned(), field.name().clone()) }) .collect() @@ -506,7 +534,7 @@ impl DFSchema { field: Column { relation: None, name: name.to_string(), - span: Span::empty(), + spans: vec![], }, }) } @@ -560,8 +588,9 @@ impl DFSchema { qualifier: &TableReference, name: &str, ) -> bool { - self.iter() - .any(|(q, f)| q.map(|q| q.eq(qualifier)).unwrap_or(false) && f.name() == name) + self.iter().any(|(q, f, _)| { + q.map(|q| q.eq(qualifier)).unwrap_or(false) && f.name() == name + }) } /// Find if the field exists with the given qualified column @@ -615,11 +644,13 @@ impl DFSchema { } let self_fields = self.iter(); let other_fields = other.iter(); - self_fields.zip(other_fields).all(|((q1, f1), (q2, f2))| { - q1 == q2 - && f1.name() == f2.name() - && Self::datatype_is_logically_equal(f1.data_type(), f2.data_type()) - }) + self_fields + .zip(other_fields) + .all(|((q1, f1, _), (q2, f2, _))| { + q1 == q2 + && f1.name() == f2.name() + && Self::datatype_is_logically_equal(f1.data_type(), f2.data_type()) + }) } /// Returns true if the two schemas have the same qualified named @@ -637,11 +668,16 @@ impl DFSchema { } let self_fields = self.iter(); let other_fields = other.iter(); - self_fields.zip(other_fields).all(|((q1, f1), (q2, f2))| { - q1 == q2 - && f1.name() == f2.name() - && Self::datatype_is_semantically_equal(f1.data_type(), f2.data_type()) - }) + self_fields + .zip(other_fields) + .all(|((q1, f1, _), (q2, f2, _))| { + q1 == q2 + && f1.name() == f2.name() + && Self::datatype_is_semantically_equal( + f1.data_type(), + f2.data_type(), + ) + }) } /// Checks if two [`DataType`]s are logically equal. This is a notably weaker constraint @@ -781,6 +817,7 @@ impl DFSchema { field_qualifiers: vec![None; self.inner.fields.len()], inner: self.inner, functional_dependencies: self.functional_dependencies, + fields_spans: self.fields_spans, } } @@ -791,13 +828,14 @@ impl DFSchema { field_qualifiers: vec![Some(qualifier); self.inner.fields.len()], inner: self.inner, functional_dependencies: self.functional_dependencies, + fields_spans: self.fields_spans, } } /// Get list of fully-qualified field names in this schema pub fn field_names(&self) -> Vec { self.iter() - .map(|(qualifier, field)| qualified_name(qualifier, field.name())) + .map(|(qualifier, field, _)| qualified_name(qualifier, field.name())) .collect::>() } @@ -811,12 +849,20 @@ impl DFSchema { &self.functional_dependencies } + /// Get fields spans + pub fn fields_spans(&self) -> &FieldsSpans { + &self.fields_spans + } + /// Iterate over the qualifiers and fields in the DFSchema - pub fn iter(&self) -> impl Iterator, &FieldRef)> { + pub fn iter( + &self, + ) -> impl Iterator, &FieldRef, &Vec)> { self.field_qualifiers .iter() .zip(self.inner.fields().iter()) - .map(|(qualifier, field)| (qualifier.as_ref(), field)) + .zip(self.fields_spans.iter()) + .map(|((qualifier, field), span)| (qualifier.as_ref(), field, span)) } } @@ -867,6 +913,7 @@ impl TryFrom for DFSchema { inner: schema, field_qualifiers: vec![None; field_count], functional_dependencies: FunctionalDependencies::empty(), + fields_spans: FieldsSpans::empty(field_count), }; Ok(dfschema) } @@ -923,6 +970,7 @@ impl ToDFSchema for Vec { inner: schema.into(), field_qualifiers: vec![None; field_count], functional_dependencies: FunctionalDependencies::empty(), + fields_spans: FieldsSpans::empty(field_count), }; Ok(dfschema) } @@ -934,7 +982,7 @@ impl Display for DFSchema { f, "fields:[{}], metadata:{:?}", self.iter() - .map(|(q, f)| qualified_name(q, f.name())) + .map(|(q, f, _)| qualified_name(q, f.name())) .collect::>() .join(", "), self.inner.metadata @@ -1284,6 +1332,7 @@ mod tests { inner: Arc::clone(&arrow_schema_ref), field_qualifiers: vec![None; arrow_schema_ref.fields.len()], functional_dependencies: FunctionalDependencies::empty(), + fields_spans: FieldsSpans::empty(arrow_schema_ref.fields.len()), }; let df_schema_ref = Arc::new(df_schema.clone()); @@ -1330,6 +1379,7 @@ mod tests { inner: Arc::clone(&schema), field_qualifiers: vec![None; schema.fields.len()], functional_dependencies: FunctionalDependencies::empty(), + fields_spans: FieldsSpans::empty(schema.fields.len()), }; assert_eq!(df_schema.inner.metadata(), schema.metadata()) diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index 8eed7e18005a..72ce99f78d00 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -54,7 +54,7 @@ pub mod with_span; pub use arrow; pub use column::Column; pub use dfschema::{ - qualified_name, DFSchema, DFSchemaRef, ExprSchema, SchemaExt, ToDFSchema, + qualified_name, DFSchema, DFSchemaRef, ExprSchema, SchemaExt, ToDFSchema, FieldsSpans, }; pub use error::{ field_not_found, unqualified_field_not_found, DataFusionError, Result, SchemaError, @@ -79,7 +79,7 @@ pub use table_reference::{ResolvedTableReference, TableReference}; pub use unnest::{RecursionUnnestOption, UnnestOptions}; pub use utils::project_schema; pub use diagnostic::{Diagnostic, DiagnosticEntry, DiagnosticEntryKind}; -pub use with_span::WithSpan; +pub use with_span::WithSpans; // These are hidden from docs purely to avoid polluting the public view of what this crate exports. // These are just re-exports of macros by the same name, which gets around the 'cannot refer to diff --git a/datafusion/common/src/with_span.rs b/datafusion/common/src/with_span.rs index d2acddaa0d2b..d0a953b40578 100644 --- a/datafusion/common/src/with_span.rs +++ b/datafusion/common/src/with_span.rs @@ -1,40 +1,54 @@ -use std::{cmp::Ordering, fmt::{self, Debug, Display}, ops::{Deref, DerefMut}}; +use std::{ + cmp::Ordering, + fmt::{self, Debug, Display}, + ops::{Deref, DerefMut}, +}; +use derivative::Derivative; use sqlparser::tokenizer::Span; /// A wrapper type for entitites that somehow refer to a location in the source /// code. For example, a [`DataType`](arrow_schema::DataType) can be tied to an /// expression which is being planned for execution, and the expression is in /// turn located somewhere in the source code. -pub struct WithSpan { - pub span: Span, +#[derive(Clone, Derivative)] +#[derivative(PartialEq, Eq, PartialOrd, Ord)] +pub struct WithSpans { + #[derivative(PartialEq = "ignore", PartialOrd = "ignore", Ord = "ignore")] + pub spans: Vec, pub inner: T, } -impl WithSpan { +impl WithSpans { pub fn into_inner(self) -> T { self.inner } - pub fn new(inner: T, span: Span) -> Self { - Self { span, inner } + pub fn new(inner: T, spans: IT) -> Self + where + IT: IntoIterator, + { + Self { + spans: spans.into_iter().collect(), + inner, + } } pub fn new_without_span(inner: T) -> Self { Self { - span: Span::empty(), + spans: vec![], inner, } } } -impl From for WithSpan { +impl From for WithSpans { fn from(inner: T) -> Self { Self::new_without_span(inner) } } -impl Deref for WithSpan { +impl Deref for WithSpans { type Target = T; fn deref(&self) -> &Self::Target { @@ -42,55 +56,26 @@ impl Deref for WithSpan { } } -impl DerefMut for WithSpan { +impl DerefMut for WithSpans { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.inner } } -impl AsRef for WithSpan { +impl AsRef for WithSpans { fn as_ref(&self) -> &T { &self.inner } } -impl PartialEq for WithSpan { - fn eq(&self, other: &Self) -> bool { - self.inner.eq(&other.inner) - } -} - -impl Eq for WithSpan {} - -impl PartialOrd for WithSpan { - fn partial_cmp(&self, other: &Self) -> Option { - self.inner.partial_cmp(&other.inner) - } -} - -impl Ord for WithSpan { - fn cmp(&self, other: &Self) -> Ordering { - self.inner.cmp(&other.inner) - } -} - -impl Debug for WithSpan { +impl Debug for WithSpans { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.inner.fmt(f) } } -impl Display for WithSpan { +impl Display for WithSpans { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.inner.fmt(f) } } - -impl Clone for WithSpan { - fn clone(&self) -> Self { - Self { - span: self.span, - inner: self.inner.clone(), - } - } -} diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 82ee52d7b2e3..ce0d145445db 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -1686,7 +1686,7 @@ impl DataFrame { let mut fields: Vec = plan .schema() .iter() - .filter_map(|(qualifier, field)| { + .filter_map(|(qualifier, field, _)| { if field.name() == name { col_exists = true; Some(new_column.clone()) @@ -1764,7 +1764,7 @@ impl DataFrame { .plan .schema() .iter() - .map(|(qualifier, field)| { + .map(|(qualifier, field, _)| { if qualifier.eq(&qualifier_rename) && field.as_ref() == field_rename { col(Column::from((qualifier, field))).alias(new_name) } else { diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 44537c951f94..96e1317a0a0c 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -963,8 +963,12 @@ impl DefaultPhysicalPlanner { // Remove temporary projected columns if left_projected || right_projected { - let final_join_result = - join_schema.iter().map(Expr::from).collect::>(); + let final_join_result = join_schema + .iter() + .map(|(q, f, spans)| { + Expr::from((q, f, spans.iter().copied())) + }) + .collect::>(); let projection = LogicalPlan::Projection(Projection::try_new( final_join_result, Arc::new(new_join), diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index 954807eae226..4000170e2824 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -31,7 +31,7 @@ use arrow::datatypes::{ use datafusion_common::types::NativeType; use datafusion_common::{ exec_datafusion_err, exec_err, internal_err, plan_datafusion_err, plan_err, - Diagnostic, DiagnosticEntry, DiagnosticEntryKind, Result, WithSpan, + Diagnostic, DiagnosticEntry, DiagnosticEntryKind, Result, WithSpans, }; use itertools::Itertools; use sqlparser::tokenizer::Span; @@ -72,16 +72,16 @@ impl Signature { /// Returns a [`Signature`] for applying `op` to arguments of type `lhs` and `rhs` fn signature<'a>( - lhs: impl Into>, - op: impl Into>, - rhs: impl Into>, + lhs: impl Into>, + op: impl Into>, + rhs: impl Into>, ) -> Result { use arrow::datatypes::DataType::*; use Operator::*; - let lhs: WithSpan<_> = lhs.into(); - let op: WithSpan<_> = op.into(); - let rhs: WithSpan<_> = rhs.into(); + let lhs: WithSpans<_> = lhs.into(); + let op: WithSpans<_> = op.into(); + let rhs: WithSpans<_> = rhs.into(); match *op { Eq | @@ -201,28 +201,28 @@ fn signature<'a>( plan_err!( "Cannot coerce arithmetic expression {lhs} {op} {rhs} to valid types" ).map_err(|err | { - if lhs.span == Span::empty() || rhs.span == Span::empty() { - err - } else { + if let (Some(&lhs_span), Some(&rhs_span)) = (lhs.spans.first(), rhs.spans.first()) { err.with_diagnostic(|_| Diagnostic { entries: vec![ DiagnosticEntry { kind: DiagnosticEntryKind::Error, message: "Binary expression has invalid types".to_string(), - span: lhs.span.union(&rhs.span), + span: lhs_span.union(&rhs_span), }, DiagnosticEntry { kind: DiagnosticEntryKind::Note, message: format!("Is of type {}", *lhs), - span: lhs.span, + span: lhs_span, }, DiagnosticEntry { kind: DiagnosticEntryKind::Note, message: format!("Is of type {}", *rhs), - span: rhs.span, + span: rhs_span, } ] }) + } else { + err } }) } @@ -232,18 +232,18 @@ fn signature<'a>( /// Returns the resulting type of a binary expression evaluating the `op` with the left and right hand types pub fn get_result_type<'a>( - lhs: impl Into>, - op: impl Into>, - rhs: impl Into>, + lhs: impl Into>, + op: impl Into>, + rhs: impl Into>, ) -> Result { signature(lhs, op, rhs).map(|sig| sig.ret) } /// Returns the coerced input types for a binary expression evaluating the `op` with the left and right hand types pub fn get_input_types<'a>( - lhs: impl Into>, - op: impl Into>, - rhs: impl Into>, + lhs: impl Into>, + op: impl Into>, + rhs: impl Into>, ) -> Result<(DataType, DataType)> { signature(lhs, op, rhs).map(|sig| (sig.lhs, sig.rhs)) } diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index c5eaa4bf88e4..5fd4faa06242 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -354,6 +354,16 @@ impl<'a> From<(Option<&'a TableReference>, &'a FieldRef)> for Expr { } } +/// Same as [`From<(Option<&'a TableReference>, &'a FieldRef)>`] but with a span +impl<'a, IT> From<(Option<&'a TableReference>, &'a FieldRef, IT)> for Expr +where + IT: IntoIterator, +{ + fn from(value: (Option<&'a TableReference>, &'a FieldRef, IT)) -> Self { + Expr::from(Column::from((value.0, value.1)).with_spans(value.2)) + } +} + impl<'a> TreeNodeContainer<'a, Self> for Expr { fn apply_elements Result>( &'a self, @@ -1114,7 +1124,11 @@ impl Expr { /// output schema. We can use this qualified name to reference the field. pub fn qualified_name(&self) -> (Option, String) { match self { - Expr::Column(Column { relation, name, span: _ }) => (relation.clone(), name.clone()), + Expr::Column(Column { + relation, + name, + spans: _, + }) => (relation.clone(), name.clone()), Expr::Alias(Alias { relation, name, .. }) => (relation.clone(), name.clone()), _ => (None, self.schema_name().to_string()), } @@ -1667,9 +1681,10 @@ impl Expr { } } - pub fn get_span(&self) -> Option { + pub fn get_spans(&self) -> Option<&Vec> { match self { - Expr::Column(Column{span, ..}) => Some(*span), + Expr::Column(Column { spans, .. }) => Some(spans), + Expr::Alias(Alias { expr, .. }) => expr.get_spans(), _ => None, } } diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index c32f217c933b..2c5c83999038 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -160,7 +160,7 @@ pub fn unnormalize_col(expr: Expr) -> Expr { let col = Column { relation: None, name: c.name, - span: c.span, + spans: c.spans, }; Transformed::yes(Expr::Column(col)) } else { @@ -182,10 +182,14 @@ pub fn create_col_from_scalar_expr( Some::(subqry_alias.into()), name, )), - Expr::Column(Column { relation: _, name, span }) => Ok(Column::new( - Some::(subqry_alias.into()), + Expr::Column(Column { + relation: _, name, - ).with_span(*span)), + spans, + }) => Ok( + Column::new(Some::(subqry_alias.into()), name) + .with_spans(spans.iter().copied()), + ), _ => { let scalar_column = scalar_expr.schema_name().to_string(); Ok(Column::new( @@ -232,7 +236,11 @@ pub fn coerce_plan_expr_for_schema( Ok(LogicalPlan::Projection(projection)) } _ => { - let exprs: Vec = plan.schema().iter().map(Expr::from).collect(); + let exprs: Vec = plan + .schema() + .iter() + .map(|(q, f, spans)| Expr::from((q, f, spans.iter().copied()))) + .collect(); let new_exprs = coerce_exprs_for_schema(exprs, plan.schema(), schema)?; let add_project = new_exprs.iter().any(|expr| expr.try_as_col().is_none()); if add_project { diff --git a/datafusion/expr/src/expr_rewriter/order_by.rs b/datafusion/expr/src/expr_rewriter/order_by.rs index c49197f78eb9..35d0fc2807db 100644 --- a/datafusion/expr/src/expr_rewriter/order_by.rs +++ b/datafusion/expr/src/expr_rewriter/order_by.rs @@ -103,7 +103,7 @@ fn rewrite_in_terms_of_projection( relation: None, name, // Span is not used in PartialEq - span: Span::empty(), + spans: vec![], }); // look for the column named the same as this expr diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 88dd74ded7d5..96ff975cf9e3 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -29,7 +29,7 @@ use arrow::compute::can_cast_types; use arrow::datatypes::{DataType, Field}; use datafusion_common::{ not_impl_err, plan_datafusion_err, plan_err, Column, ExprSchema, Result, - TableReference, WithSpan, + TableReference, WithSpans, }; use datafusion_functions_window_common::field::WindowUDFFieldArgs; use recursive::recursive; @@ -405,16 +405,16 @@ impl ExprSchemable for Expr { }) => { let (left_type, left_is_nullable) = left.data_type_and_nullable(schema)?; - let left_type = if let Some(span) = left.get_span() { - WithSpan::new(&left_type, span) + let left_type = if let Some(spans) = left.get_spans() { + WithSpans::new(&left_type, spans.iter().copied()) } else { (&left_type).into() }; let (right_type, right_is_nullable) = right.data_type_and_nullable(schema)?; - let right_type = if let Some(span) = right.get_span() { - WithSpan::new(&right_type, span) + let right_type = if let Some(spans) = right.get_spans() { + WithSpans::new(&right_type, spans.iter().copied()) } else { (&right_type).into() }; diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 4a8c5d08e284..0e810a623b96 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -52,13 +52,11 @@ use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef}; use datafusion_common::display::ToStringifiedPlan; use datafusion_common::file_options::file_type::FileType; use datafusion_common::{ - exec_err, get_target_functional_dependencies, internal_err, not_impl_err, - plan_datafusion_err, plan_err, Column, DFSchema, DFSchemaRef, DataFusionError, - FunctionalDependencies, Result, ScalarValue, TableReference, ToDFSchema, - UnnestOptions, + exec_err, get_target_functional_dependencies, internal_err, not_impl_err, plan_datafusion_err, plan_err, Column, DFSchema, DFSchemaRef, DataFusionError, FieldsSpans, FunctionalDependencies, Result, ScalarValue, TableReference, ToDFSchema, UnnestOptions }; use datafusion_expr_common::type_coercion::binary::type_union_resolution; use indexmap::IndexSet; +use sqlparser::tokenizer::Span; /// Default table name for unnamed table pub const UNNAMED_TABLE: &str = "?table?"; @@ -1340,7 +1338,7 @@ pub fn change_redundant_column(fields: &Fields) -> Vec { fn mark_field(schema: &DFSchema) -> (Option, Arc) { let mut table_references = schema .iter() - .filter_map(|(qualifier, _)| qualifier) + .filter_map(|(qualifier, _, _)| qualifier) .collect::>(); table_references.dedup(); let table_reference = if table_references.len() == 1 { @@ -1374,8 +1372,8 @@ pub fn build_join_schema( .collect() } - let right_fields = right.iter(); - let left_fields = left.iter(); + let right_fields = right.iter().map(|(q, f, _)| (q, f)); + let left_fields = left.iter().map(|(q, f, _)| (q, f)); let qualified_fields: Vec<(Option, Arc)> = match join_type { JoinType::Inner => { @@ -1437,6 +1435,9 @@ pub fn build_join_schema( join_type, left.fields().len(), ); + let left_fields_spans: FieldsSpans = left.iter().map(|(_, _, spans)| spans).cloned().collect(); + let right_fields_spans: FieldsSpans = right.iter().map(|(_, _, spans)| spans).cloned().collect(); + let fields_spans = left_fields_spans.join(&right_fields_spans, join_type, left.fields().len()); let metadata = left .metadata() .clone() @@ -1444,7 +1445,8 @@ pub fn build_join_schema( .chain(right.metadata().clone()) .collect(); let dfschema = DFSchema::new_with_metadata(qualified_fields, metadata)?; - dfschema.with_functional_dependencies(func_dependencies) + let dfschema = dfschema.with_functional_dependencies(func_dependencies)?; + Ok(dfschema.with_fields_spans(fields_spans)) } /// Add additional "synthetic" group by expressions based on functional @@ -1863,7 +1865,7 @@ pub fn unnest_with_options( let fields = input_schema .iter() .enumerate() - .map(|(index, (original_qualifier, original_field))| { + .map(|(index, (original_qualifier, original_field, _))| { match indices_to_unnest.get(&index) { Some(column_to_unnest) => { let recursions_on_column = options @@ -2212,7 +2214,7 @@ mod tests { Column { relation: Some(TableReference::Bare { table }), name, - span: _, + spans: _, }, }, _, diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index fe725e7d96de..af6c0ffec5cd 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -2163,13 +2163,19 @@ impl Projection { /// produced by the projection operation. If the schema computation is successful, /// the `Result` will contain the schema; otherwise, it will contain an error. pub fn projection_schema(input: &LogicalPlan, exprs: &[Expr]) -> Result> { + let metadata = input.schema().metadata().clone(); + let fields_spans = exprs + .iter() + .map(|e| e.get_spans().cloned().unwrap_or_else(|| vec![])) + .collect(); let schema = DFSchema::new_with_metadata(exprlist_to_fields(exprs, input)?, metadata)? .with_functional_dependencies(calc_func_dependencies_for_project( exprs, input, - )?)?; + )?)? + .with_fields_spans(fields_spans); Ok(Arc::new(schema)) } @@ -2200,9 +2206,11 @@ impl SubqueryAlias { // Since schema is the same, other than qualifier, we can use existing // functional dependencies: let func_dependencies = plan.schema().functional_dependencies().clone(); + let fields_spans = plan.schema().fields_spans().clone(); let schema = DFSchemaRef::new( DFSchema::try_from_qualified_schema(alias.clone(), &schema)? - .with_functional_dependencies(func_dependencies)?, + .with_functional_dependencies(func_dependencies)? + .with_fields_spans(fields_spans), ); Ok(SubqueryAlias { input: plan, @@ -2379,7 +2387,7 @@ impl Window { let fields: Vec<(Option, Arc)> = input .schema() .iter() - .map(|(q, f)| (q.cloned(), Arc::clone(f))) + .map(|(q, f, _)| (q.cloned(), Arc::clone(f))) .collect(); let input_len = fields.len(); let mut window_fields = fields; diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 6f7c5d379260..85ab065b7962 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -363,7 +363,10 @@ fn get_exprs_except_skipped( columns_to_skip: HashSet, ) -> Vec { if columns_to_skip.is_empty() { - schema.iter().map(Expr::from).collect::>() + schema + .iter() + .map(|(q, f, spans)| Expr::from((q, f, spans.iter().copied()))) + .collect::>() } else { schema .columns() diff --git a/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs b/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs index 1853f0aaf3ec..225b6098a9eb 100644 --- a/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs +++ b/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs @@ -123,7 +123,7 @@ fn expand_exprlist(input: &LogicalPlan, expr: Vec) -> Result> { Expr::Column(Column { ref relation, ref name, - span, + ref spans, }) => { if name.eq("*") { let expanded_columns = if let Some(qualifier) = relation { @@ -142,7 +142,7 @@ fn expand_exprlist(input: &LogicalPlan, expr: Vec) -> Result> { projected_expr.extend(expanded_columns.into_iter().map(|mut expr| { if let Expr::Column(c) = &mut expr { - c.span = span; + c.spans = spans.clone(); } expr })); diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 1bd4009b47cc..c5aeb0ccddc9 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -574,7 +574,7 @@ fn transform_schema_to_nonview(dfschema: &DFSchemaRef) -> Option>, Vec>) = dfschema .iter() - .map(|(qualifier, field)| match field.data_type() { + .map(|(qualifier, field, _)| match field.data_type() { DataType::Utf8View => { transformed = true; ( @@ -974,7 +974,7 @@ pub fn coerce_union_schema(inputs: &[Arc]) -> Result { union_nullabilities, union_field_meta.into_iter() ) - .map(|((qualifier, field), datatype, nullable, metadata)| { + .map(|((qualifier, field, _), datatype, nullable, metadata)| { let mut field = Field::new(field.name().clone(), datatype, nullable); field.set_metadata(metadata); (qualifier.cloned(), field.into()) @@ -1000,7 +1000,7 @@ fn project_with_column_index( Expr::Column(Column { relation: _, ref name, - span: _, + spans: _, }) if name != schema.field(i).name() => e.alias(schema.field(i).name()), Expr::Alias { .. } | Expr::Column { .. } => e, _ => e.alias(schema.field(i).name()), diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 0ea2d24effbb..163d0966aff6 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -751,7 +751,7 @@ fn build_common_expr_project_plan( }) .collect::>>()?; - for (qualifier, field) in input.schema().iter() { + for (qualifier, field, _) in input.schema().iter() { if fields_set.insert(qualified_name(qualifier, field.name())) { project_exprs.push(Expr::from((qualifier, field))); } @@ -769,7 +769,10 @@ fn build_recover_project_plan( schema: &DFSchema, input: LogicalPlan, ) -> Result { - let col_exprs = schema.iter().map(Expr::from).collect(); + let col_exprs = schema + .iter() + .map(|(q, f, spans)| Expr::from((q, f, spans.iter().copied()))) + .collect(); Projection::try_new(col_exprs, Arc::new(input)).map(LogicalPlan::Projection) } diff --git a/datafusion/optimizer/src/decorrelate.rs b/datafusion/optimizer/src/decorrelate.rs index 5ebd73799dd5..5c519dce8fbb 100644 --- a/datafusion/optimizer/src/decorrelate.rs +++ b/datafusion/optimizer/src/decorrelate.rs @@ -532,7 +532,7 @@ fn proj_exprs_evaluation_result_on_empty_batch( let result_expr = simplifier.simplify(result_expr)?; let expr_name = match expr { Expr::Alias(Alias { name, .. }) => name.to_string(), - Expr::Column(Column { relation: _, name, span: _ }) => name.to_string(), + Expr::Column(Column { relation: _, name, spans: _ }) => name.to_string(), _ => expr.schema_name().to_string(), }; expr_result_map_for_count_bug.insert(expr_name, result_expr); diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index fe751a5fb583..ba4bd5734fb0 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -238,7 +238,7 @@ impl<'a> ColumnChecker<'a> { fn schema_columns(schema: &DFSchema) -> HashSet { schema .iter() - .flat_map(|(qualifier, field)| { + .flat_map(|(qualifier, field, _)| { [ Column::new(qualifier.cloned(), field.name()), // we need to push down filter using unqualified column as well @@ -818,7 +818,7 @@ impl OptimizerRule for PushDownFilter { } LogicalPlan::SubqueryAlias(subquery_alias) => { let mut replace_map = HashMap::new(); - for (i, (qualifier, field)) in + for (i, (qualifier, field, _)) in subquery_alias.input.schema().iter().enumerate() { let (sub_qualifier, sub_field) = @@ -910,7 +910,7 @@ impl OptimizerRule for PushDownFilter { let mut inputs = Vec::with_capacity(union.inputs.len()); for input in &union.inputs { let mut replace_map = HashMap::new(); - for (i, (qualifier, field)) in input.schema().iter().enumerate() { + for (i, (qualifier, field, _)) in input.schema().iter().enumerate() { let (union_qualifier, union_field) = union.schema.qualified_field(i); replace_map.insert( @@ -1156,7 +1156,7 @@ fn rewrite_projection( .schema .iter() .zip(projection.expr.iter()) - .map(|((qualifier, field), expr)| { + .map(|((qualifier, field, _), expr)| { // strip alias, as they should not be part of filters let expr = expr.clone().unalias(); diff --git a/datafusion/optimizer/src/replace_distinct_aggregate.rs b/datafusion/optimizer/src/replace_distinct_aggregate.rs index 48b2828faf45..4787ba8d4ca7 100644 --- a/datafusion/optimizer/src/replace_distinct_aggregate.rs +++ b/datafusion/optimizer/src/replace_distinct_aggregate.rs @@ -157,7 +157,7 @@ impl OptimizerRule for ReplaceDistinctWithAggregate { .iter() .skip(expr_cnt) .zip(schema.iter()) - .map(|((new_qualifier, new_field), (old_qualifier, old_field))| { + .map(|((new_qualifier, new_field, _), (old_qualifier, old_field, _))| { col(Column::from((new_qualifier, new_field))) .alias_qualified(old_qualifier.cloned(), old_field.name()) }) diff --git a/datafusion/sql/src/expr/identifier.rs b/datafusion/sql/src/expr/identifier.rs index 14c7bb8f51c1..735221eae6d5 100644 --- a/datafusion/sql/src/expr/identifier.rs +++ b/datafusion/sql/src/expr/identifier.rs @@ -63,7 +63,7 @@ impl SqlToRel<'_, S> { return Ok(Expr::Column(Column { relation: qualifier.filter(|q| q.table() != UNNAMED_TABLE).cloned(), name: normalize_ident, - span: id_span, + spans: vec![id_span], })); } @@ -84,7 +84,7 @@ impl SqlToRel<'_, S> { Ok(Expr::Column(Column { relation: None, name: normalize_ident, - span: id_span, + spans: vec![id_span], })) } } diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index 0195f3d32032..c08a9fd44e04 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -150,14 +150,14 @@ impl SqlToRel<'_, S> { match expr { Expr::Column(col) => match &col.relation { Some(q) => { - match schema.iter().find(|(qualifier, field)| match qualifier { + match schema.iter().find(|(qualifier, field, _)| match qualifier { Some(field_q) => { field.name() == &col.name && field_q.to_string().ends_with(&format!(".{q}")) } _ => false, }) { - Some((qualifier, df_field)) => Expr::from((qualifier, df_field)), + Some((qualifier, df_field, _)) => Expr::from((qualifier, df_field)), None => Expr::Column(col), } } diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 0b1481544d05..bb5aa305bc8b 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -38,6 +38,7 @@ use datafusion_common::{ use datafusion_expr::logical_plan::{LogicalPlan, LogicalPlanBuilder}; use datafusion_expr::utils::find_column_exprs; use datafusion_expr::{col, Expr}; +use sqlparser::tokenizer::Span; use crate::utils::{make_decimal_type, value_to_string}; pub use datafusion_expr::planner::ContextProvider; @@ -393,7 +394,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { col.name, r ), DiagnosticEntryKind::Error, - col.span, + col.spans.first().copied().unwrap_or(Span::empty()), )]) }) }, @@ -414,7 +415,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { Diagnostic::new([DiagnosticEntry::new( format!("No field named '{}'", col.name), DiagnosticEntryKind::Error, - col.span, + col.spans.first().copied().unwrap_or(Span::empty()), )]) }) }), diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index 6d1b3f185ae1..48abe99cc941 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -814,7 +814,11 @@ impl SqlToRel<'_, S> { Diagnostic::new([DiagnosticEntry::new( "GROUP BY clause is here", DiagnosticEntryKind::Note, - Span::union_iter(group_by_exprs.iter().filter_map(|e| e.get_span())), + Span::union_iter( + group_by_exprs + .iter() + .flat_map(|e| e.get_spans().cloned().unwrap_or_default()), + ), )]) }) })?; diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index 47a842643b8d..572ac8221b1c 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -1739,7 +1739,7 @@ impl SqlToRel<'_, S> { // Build updated values for each column, using the previous value if not modified let exprs = table_schema .iter() - .map(|(qualifier, field)| { + .map(|(qualifier, field, _)| { let expr = match assign_map.remove(field.name()) { Some(new_value) => { let mut expr = self.sql_to_expr( diff --git a/datafusion/sql/src/unparser/plan.rs b/datafusion/sql/src/unparser/plan.rs index 5f015c6465fc..8118aa7fa845 100644 --- a/datafusion/sql/src/unparser/plan.rs +++ b/datafusion/sql/src/unparser/plan.rs @@ -911,12 +911,12 @@ impl Unparser<'_> { Expr::Column(Column { relation: _, name: left_name, - span: _, + spans: _, }), Expr::Column(Column { relation: _, name: right_name, - span: _, + spans: _, }), ) if left_name == right_name => { idents.push(self.new_ident_quoted_if_needs(left_name.to_string())); diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index b2133d463f87..cfe9feb7ce03 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -101,8 +101,12 @@ pub enum CheckColumnsSatisfyExprsPurpose { impl CheckColumnsSatisfyExprsPurpose { fn get_error_message(&self) -> &'static str { match self { - CheckColumnsSatisfyExprsPurpose::GroupBy => "Projection references non-aggregate values", - CheckColumnsSatisfyExprsPurpose::Having => "HAVING clause references non-aggregate values", + CheckColumnsSatisfyExprsPurpose::GroupBy => { + "Projection references non-aggregate values" + } + CheckColumnsSatisfyExprsPurpose::Having => { + "HAVING clause references non-aggregate values" + } } } } @@ -158,19 +162,30 @@ fn check_column_satisfies_expr( ) .map_err(|err| { let message = match call_purpose { - CheckColumnsSatisfyExprsPurpose::GroupBy => format!("'{}' in projection does not appear in GROUP BY clause", expr.to_string()), - CheckColumnsSatisfyExprsPurpose::Having => format!("'{}' in HAVING clause does not appear in GROUP BY clause", expr.to_string()), + CheckColumnsSatisfyExprsPurpose::GroupBy => format!( + "'{}' in projection does not appear in GROUP BY clause", + expr.to_string() + ), + CheckColumnsSatisfyExprsPurpose::Having => format!( + "'{}' in HAVING clause does not appear in GROUP BY clause", + expr.to_string() + ), }; err.with_diagnostic(|_| { - Diagnostic::new([DiagnosticEntry::new( - message, - DiagnosticEntryKind::Error, - expr.get_span().unwrap_or(Span::empty()), - ),DiagnosticEntry::new( - format!("Add '{}' to GROUP BY clause", expr.to_string()), - DiagnosticEntryKind::Help, - Span::empty(), - )]) + Diagnostic::new([ + DiagnosticEntry::new( + message, + DiagnosticEntryKind::Error, + expr.get_spans() + .and_then(|spans| spans.first().copied()) + .unwrap_or(Span::empty()), + ), + DiagnosticEntry::new( + format!("Add '{}' to GROUP BY clause", expr.to_string()), + DiagnosticEntryKind::Help, + Span::empty(), + ), + ]) }) }); } From 03afbf675fd2353d6b27f77c80faa5ae417e0d0e Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Tue, 10 Dec 2024 10:04:09 +0100 Subject: [PATCH 6/8] feat: UNION wrong number of fields diagnostics --- datafusion/expr/src/logical_plan/builder.rs | 42 ++++++++++++++++++--- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index da3903615bec..445cf3dea3a6 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -52,7 +52,11 @@ use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef}; use datafusion_common::display::ToStringifiedPlan; use datafusion_common::file_options::file_type::FileType; use datafusion_common::{ - exec_err, get_target_functional_dependencies, internal_err, not_impl_err, plan_datafusion_err, plan_err, Column, DFSchema, DFSchemaRef, DataFusionError, FieldsSpans, FunctionalDependencies, Result, ScalarValue, TableReference, ToDFSchema, UnnestOptions + exec_err, get_target_functional_dependencies, internal_err, not_impl_err, + plan_datafusion_err, plan_err, Column, DFSchema, DFSchemaRef, DataFusionError, + Diagnostic, DiagnosticEntry, DiagnosticEntryKind, FieldsSpans, + FunctionalDependencies, Result, ScalarValue, TableReference, ToDFSchema, + UnnestOptions, }; use datafusion_expr_common::type_coercion::binary::type_union_resolution; @@ -1436,9 +1440,12 @@ pub fn build_join_schema( join_type, left.fields().len(), ); - let left_fields_spans: FieldsSpans = left.iter().map(|(_, _, spans)| spans).cloned().collect(); - let right_fields_spans: FieldsSpans = right.iter().map(|(_, _, spans)| spans).cloned().collect(); - let fields_spans = left_fields_spans.join(&right_fields_spans, join_type, left.fields().len()); + let left_fields_spans: FieldsSpans = + left.iter().map(|(_, _, spans)| spans).cloned().collect(); + let right_fields_spans: FieldsSpans = + right.iter().map(|(_, _, spans)| spans).cloned().collect(); + let fields_spans = + left_fields_spans.join(&right_fields_spans, join_type, left.fields().len()); let metadata = left .metadata() .clone() @@ -1526,7 +1533,32 @@ pub fn union(left_plan: LogicalPlan, right_plan: LogicalPlan) -> Result Date: Tue, 10 Dec 2024 12:23:24 +0100 Subject: [PATCH 7/8] feat: collect more errors --- datafusion/common/src/error.rs | 64 ++++++++++++++++++++++++++++++++++ datafusion/expr/src/utils.rs | 20 +++++++---- datafusion/sql/src/utils.rs | 20 ++++++++--- 3 files changed, 94 insertions(+), 10 deletions(-) diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index f1a816ceac29..861fff185c75 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -133,6 +133,7 @@ pub enum DataFusionError { /// or serializing/deserializing protobytes to Substrait plans Substrait(String), Diagnostic(Diagnostic, Box), + Collection(Vec), } #[macro_export] @@ -319,6 +320,7 @@ impl Error for DataFusionError { DataFusionError::Context(_, e) => Some(e.as_ref()), DataFusionError::Substrait(_) => None, DataFusionError::Diagnostic(_, e) => Some(e.as_ref()), + DataFusionError::Collection(_) => None, } } } @@ -442,6 +444,7 @@ impl DataFusionError { DataFusionError::Context(_, _) => "", DataFusionError::Substrait(_) => "Substrait error: ", DataFusionError::Diagnostic(_, _) => "", + DataFusionError::Collection(_) => "", } } @@ -483,6 +486,13 @@ impl DataFusionError { } DataFusionError::Substrait(ref desc) => Cow::Owned(desc.to_string()), DataFusionError::Diagnostic(_, ref err) => Cow::Owned(err.to_string()), + DataFusionError::Collection(ref v) => Cow::Owned(format!( + "[{}]", + v.iter() + .map(|e| e.to_string()) + .collect::>() + .join(", ") + )), } } @@ -517,6 +527,60 @@ impl DataFusionError { DiagnosticsIterator { head: self } } + + pub fn get_individual_errors(&self) -> impl Iterator + '_ { + fn contains_collection(err: &DataFusionError) -> bool { + let mut head = err; + loop { + if let DataFusionError::Collection(_) = head { + return true; + } + + if let Some(source) = head + .source() + .map(|source| source.downcast_ref::()) + .flatten() + { + head = source; + } else { + return false; + } + } + } + + struct IndividualErrorsIterator<'a> { + queue: Vec<&'a DataFusionError>, + } + + impl<'a> Iterator for IndividualErrorsIterator<'a> { + type Item = &'a DataFusionError; + + fn next(&mut self) -> Option { + while let Some(err) = self.queue.pop() { + if !contains_collection(err) { + return Some(err); + } + + if let DataFusionError::Collection(errs) = err { + self.queue.extend(errs.iter()); + continue; + } + + if let Some(source) = err + .source() + .map(|source| source.downcast_ref::()) + .flatten() + { + self.queue.push(source); + } + } + + None + } + } + + IndividualErrorsIterator { queue: vec![self] } + } } /// Unwrap an `Option` if possible. Otherwise return an `DataFusionError::Internal`. diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 85ab065b7962..5d9556a2f040 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -703,7 +703,7 @@ pub fn exprlist_to_fields<'a>( // Look for exact match in plan's output schema let wildcard_schema = find_base_plan(plan).schema(); let input_schema = plan.schema(); - let result = exprs + let (fields, errs) = exprs .into_iter() .map(|e| match e { Expr::Wildcard { qualifier, options } => match qualifier { @@ -759,11 +759,19 @@ pub fn exprlist_to_fields<'a>( }, _ => Ok(vec![e.to_field(input_schema)?]), }) - .collect::>>()? - .into_iter() - .flatten() - .collect(); - Ok(result) + .fold((vec![], vec![]), |(mut fields, mut errs), result| { + match result { + Ok(this_fields) => fields.extend(this_fields), + Err(err) => errs.push(err), + } + (fields, errs) + }); + + if !errs.is_empty() { + Err(DataFusionError::Collection(errs)) + } else { + Ok(fields) + } } /// Find the suitable base plan to expand the wildcard expression recursively. diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index d4cdd5379389..432d04510c6a 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -123,29 +123,41 @@ pub(crate) fn check_columns_satisfy_exprs( Expr::Column(_) => Ok(()), _ => internal_err!("Expr::Column are required"), })?; + let mut errs = vec![]; let column_exprs = find_column_exprs(exprs); for e in &column_exprs { match e { Expr::GroupingSet(GroupingSet::Rollup(exprs)) => { for e in exprs { - check_column_satisfies_expr(columns, e, call_purpose)?; + if let Err(err) = check_column_satisfies_expr(columns, e, call_purpose) { + errs.push(err); + } } } Expr::GroupingSet(GroupingSet::Cube(exprs)) => { for e in exprs { - check_column_satisfies_expr(columns, e, call_purpose)?; + if let Err(err) = check_column_satisfies_expr(columns, e, call_purpose) { + errs.push(err); + } } } Expr::GroupingSet(GroupingSet::GroupingSets(lists_of_exprs)) => { for exprs in lists_of_exprs { for e in exprs { - check_column_satisfies_expr(columns, e, call_purpose)?; + if let Err(err) = check_column_satisfies_expr(columns, e, call_purpose) { + errs.push(err); + } } } } - _ => check_column_satisfies_expr(columns, e, call_purpose)?, + _ => if let Err(err) = check_column_satisfies_expr(columns, e, call_purpose) { + errs.push(err); + }, } } + if !errs.is_empty() { + return Err(DataFusionError::Collection(errs)); + } Ok(()) } From 0506a80838ba3a40120089474bb727b94db30b81 Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Tue, 10 Dec 2024 12:47:20 +0100 Subject: [PATCH 8/8] feat: ignorable errors --- datafusion/common/src/column.rs | 7 +- .../common/src/dfschema/fields_spans.rs | 4 +- datafusion/common/src/dfschema/mod.rs | 1 - datafusion/common/src/error.rs | 37 +++++--- datafusion/common/src/lib.rs | 6 +- datafusion/common/src/with_span.rs | 1 - .../expr-common/src/type_coercion/binary.rs | 1 - datafusion/expr/Cargo.toml | 1 + datafusion/expr/src/expr.rs | 37 +++++++- datafusion/expr/src/expr_rewriter/mod.rs | 1 + datafusion/expr/src/expr_rewriter/order_by.rs | 1 - datafusion/expr/src/expr_schema.rs | 9 +- datafusion/expr/src/logical_plan/builder.rs | 10 +- datafusion/expr/src/logical_plan/plan.rs | 6 +- datafusion/expr/src/tree_node.rs | 11 ++- .../src/analyzer/expand_wildcard_rule.rs | 27 +++--- .../optimizer/src/analyzer/type_coercion.rs | 1 + datafusion/optimizer/src/decorrelate.rs | 7 +- .../optimizer/src/optimize_projections/mod.rs | 1 + .../src/replace_distinct_aggregate.rs | 13 ++- datafusion/proto-common/src/to_proto/mod.rs | 2 +- datafusion/sql/src/expr/identifier.rs | 5 +- datafusion/sql/src/expr/mod.rs | 4 +- datafusion/sql/src/planner.rs | 5 +- datafusion/sql/src/relation/join.rs | 57 +++++++---- datafusion/sql/src/relation/mod.rs | 9 +- datafusion/sql/src/select.rs | 94 +++++++++++++++---- datafusion/sql/src/statement.rs | 9 +- datafusion/sql/src/unparser/plan.rs | 2 + datafusion/sql/src/unparser/rewrite.rs | 1 + datafusion/sql/src/utils.rs | 31 +++--- 31 files changed, 276 insertions(+), 125 deletions(-) diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index 0ddd1f0a4112..29be76e56d38 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -33,8 +33,8 @@ use std::fmt; use std::str::FromStr; /// A named reference to a qualified field in a schema. -#[derive(Debug, Clone, Derivative)] -#[derivative(PartialEq, Eq, Hash, PartialOrd, Ord)] +#[derive(Debug, Derivative)] +#[derivative(PartialEq, Eq, Hash, PartialOrd, Ord, Clone)] pub struct Column { /// relation/table reference. pub relation: Option, @@ -254,8 +254,7 @@ impl Column { .flat_map(|s| s.columns_with_unqualified_name(&self.name)) .collect::>(); for using_col in using_columns { - let all_matched = - columns.iter().all(|c| using_col.contains(c)); + let all_matched = columns.iter().all(|c| using_col.contains(c)); // All matched fields belong to the same using column set, in orther words // the same join clause. We simply pick the qualifier from the first match. if all_matched { diff --git a/datafusion/common/src/dfschema/fields_spans.rs b/datafusion/common/src/dfschema/fields_spans.rs index bbfa9f2783a7..c4fd0d8406a0 100644 --- a/datafusion/common/src/dfschema/fields_spans.rs +++ b/datafusion/common/src/dfschema/fields_spans.rs @@ -9,7 +9,7 @@ pub struct FieldsSpans(Vec>); impl FieldsSpans { pub fn empty(field_count: usize) -> Self { - Self((0..field_count).into_iter().map(|_| Vec::new()).collect()) + Self((0..field_count).map(|_| Vec::new()).collect()) } pub fn iter(&self) -> impl Iterator> { @@ -20,7 +20,7 @@ impl FieldsSpans { &self, other: &FieldsSpans, join_type: &JoinType, - left_cols_len: usize, + _left_cols_len: usize, ) -> FieldsSpans { match join_type { JoinType::Inner | JoinType::Left | JoinType::Right | JoinType::Full => { diff --git a/datafusion/common/src/dfschema/mod.rs b/datafusion/common/src/dfschema/mod.rs index aadcfc423d12..eb07d2705c81 100644 --- a/datafusion/common/src/dfschema/mod.rs +++ b/datafusion/common/src/dfschema/mod.rs @@ -20,7 +20,6 @@ mod fields_spans; pub use fields_spans::FieldsSpans; -use std::backtrace::Backtrace; use std::collections::{BTreeSet, HashMap, HashSet}; use std::fmt::{Display, Formatter}; use std::hash::Hash; diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index 861fff185c75..46387c982eba 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -28,7 +28,7 @@ use std::sync::Arc; use crate::diagnostic::Diagnostic; use crate::utils::quote_identifier; -use crate::{Column, DFSchema, DiagnosticEntry, DiagnosticEntryKind, TableReference}; +use crate::{Column, DFSchema, TableReference}; #[cfg(feature = "avro")] use apache_avro::Error as AvroError; use arrow::error::ArrowError; @@ -514,8 +514,7 @@ impl DataFusionError { if let Some(source) = self .head .source() - .map(|source| source.downcast_ref::()) - .flatten() + .and_then(|source| source.downcast_ref::()) { self.head = source; } else { @@ -528,7 +527,9 @@ impl DataFusionError { DiagnosticsIterator { head: self } } - pub fn get_individual_errors(&self) -> impl Iterator + '_ { + pub fn get_individual_errors( + &self, + ) -> impl Iterator, &Self)> + '_ { fn contains_collection(err: &DataFusionError) -> bool { let mut head = err; loop { @@ -538,8 +539,7 @@ impl DataFusionError { if let Some(source) = head .source() - .map(|source| source.downcast_ref::()) - .flatten() + .and_then(|source| source.downcast_ref::()) { head = source; } else { @@ -549,29 +549,34 @@ impl DataFusionError { } struct IndividualErrorsIterator<'a> { - queue: Vec<&'a DataFusionError>, + queue: Vec<(&'a DataFusionError, Vec)>, } impl<'a> Iterator for IndividualErrorsIterator<'a> { - type Item = &'a DataFusionError; + type Item = (Vec, &'a DataFusionError); fn next(&mut self) -> Option { - while let Some(err) = self.queue.pop() { + while let Some((err, mut diagnostics_prefix)) = self.queue.pop() { if !contains_collection(err) { - return Some(err); + return Some((diagnostics_prefix, err)); } if let DataFusionError::Collection(errs) = err { - self.queue.extend(errs.iter()); + self.queue.extend( + errs.iter().map(|err| (err, diagnostics_prefix.clone())), + ); continue; } + if let DataFusionError::Diagnostic(diagnostics, _) = err { + diagnostics_prefix.push(diagnostics.clone()); + } + if let Some(source) = err .source() - .map(|source| source.downcast_ref::()) - .flatten() + .and_then(|source| source.downcast_ref::()) { - self.queue.push(source); + self.queue.push((source, diagnostics_prefix)); } } @@ -579,7 +584,9 @@ impl DataFusionError { } } - IndividualErrorsIterator { queue: vec![self] } + IndividualErrorsIterator { + queue: vec![(self, vec![])], + } } } diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index 72ce99f78d00..46dea48e1787 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -33,6 +33,7 @@ pub mod alias; pub mod cast; pub mod config; pub mod cse; +pub mod diagnostic; pub mod display; pub mod error; pub mod file_options; @@ -47,15 +48,15 @@ pub mod test_util; pub mod tree_node; pub mod types; pub mod utils; -pub mod diagnostic; pub mod with_span; /// Reexport arrow crate pub use arrow; pub use column::Column; pub use dfschema::{ - qualified_name, DFSchema, DFSchemaRef, ExprSchema, SchemaExt, ToDFSchema, FieldsSpans, + qualified_name, DFSchema, DFSchemaRef, ExprSchema, FieldsSpans, SchemaExt, ToDFSchema, }; +pub use diagnostic::{Diagnostic, DiagnosticEntry, DiagnosticEntryKind}; pub use error::{ field_not_found, unqualified_field_not_found, DataFusionError, Result, SchemaError, SharedResult, @@ -78,7 +79,6 @@ pub use stats::{ColumnStatistics, Statistics}; pub use table_reference::{ResolvedTableReference, TableReference}; pub use unnest::{RecursionUnnestOption, UnnestOptions}; pub use utils::project_schema; -pub use diagnostic::{Diagnostic, DiagnosticEntry, DiagnosticEntryKind}; pub use with_span::WithSpans; // These are hidden from docs purely to avoid polluting the public view of what this crate exports. diff --git a/datafusion/common/src/with_span.rs b/datafusion/common/src/with_span.rs index d0a953b40578..d0e3c6d5e82b 100644 --- a/datafusion/common/src/with_span.rs +++ b/datafusion/common/src/with_span.rs @@ -1,5 +1,4 @@ use std::{ - cmp::Ordering, fmt::{self, Debug, Display}, ops::{Deref, DerefMut}, }; diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index 847fd6339b22..40fbf6c272bd 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -34,7 +34,6 @@ use datafusion_common::{ Diagnostic, DiagnosticEntry, DiagnosticEntryKind, Result, WithSpans, }; use itertools::Itertools; -use sqlparser::tokenizer::Span; /// The type signature of an instantiation of binary operator expression such as /// `lhs + rhs` diff --git a/datafusion/expr/Cargo.toml b/datafusion/expr/Cargo.toml index 438662e0642b..fcaf96791952 100644 --- a/datafusion/expr/Cargo.toml +++ b/datafusion/expr/Cargo.toml @@ -56,6 +56,7 @@ serde_json = { workspace = true } sqlparser = { workspace = true } strum = { version = "0.26.1", features = ["derive"] } strum_macros = "0.26.0" +derivative = { workspace = true } [dev-dependencies] ctor = { workspace = true } diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 5fd4faa06242..f2e5891feb4d 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -38,6 +38,7 @@ use datafusion_common::{ plan_err, Column, DFSchema, HashMap, Result, ScalarValue, TableReference, }; use datafusion_functions_window_common::field::WindowUDFFieldArgs; +use derivative::Derivative; use sqlparser::ast::{ display_comma_separated, ExceptSelectItem, ExcludeSelectItem, IlikeSelectItem, NullTreatment, RenameSelectItem, ReplaceSelectElement, @@ -401,11 +402,19 @@ impl Unnest { } /// Alias expression -#[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)] +#[derive(Clone, Derivative, Debug)] +#[derivative(PartialEq, Eq, PartialOrd, Hash)] pub struct Alias { pub expr: Box, pub relation: Option, pub name: String, + #[derivative( + PartialEq = "ignore", + Hash = "ignore", + PartialOrd = "ignore", + Ord = "ignore" + )] + pub span: Span, } impl Alias { @@ -419,8 +428,13 @@ impl Alias { expr: Box::new(expr), relation: relation.map(|r| r.into()), name: name.into(), + span: Span::empty(), } } + + pub fn with_span(self, span: Span) -> Self { + Self { span, ..self } + } } /// Binary expression @@ -1128,6 +1142,7 @@ impl Expr { relation, name, spans: _, + .. }) => (relation.clone(), name.clone()), Expr::Alias(Alias { relation, name, .. }) => (relation.clone(), name.clone()), _ => (None, self.schema_name().to_string()), @@ -1681,11 +1696,22 @@ impl Expr { } } - pub fn get_spans(&self) -> Option<&Vec> { + pub fn get_span(&self) -> Span { match self { - Expr::Column(Column { spans, .. }) => Some(spans), - Expr::Alias(Alias { expr, .. }) => expr.get_spans(), - _ => None, + Expr::Column(Column { spans, .. }) => match spans.as_slice() { + [] => panic!("No spans for column expr"), + [span] => *span, + _ => panic!("Column expr has more than one span"), + }, + Expr::Alias(Alias { + expr, + span: alias_span, + .. + }) => { + let span = expr.get_span(); + span.union(alias_span) + } + _ => Span::empty(), } } } @@ -1701,6 +1727,7 @@ impl HashNode for Expr { expr: _expr, relation, name, + .. }) => { relation.hash(state); name.hash(state); diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index 2c5c83999038..f2557fb11281 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -186,6 +186,7 @@ pub fn create_col_from_scalar_expr( relation: _, name, spans, + .. }) => Ok( Column::new(Some::(subqry_alias.into()), name) .with_spans(spans.iter().copied()), diff --git a/datafusion/expr/src/expr_rewriter/order_by.rs b/datafusion/expr/src/expr_rewriter/order_by.rs index 35d0fc2807db..258991914012 100644 --- a/datafusion/expr/src/expr_rewriter/order_by.rs +++ b/datafusion/expr/src/expr_rewriter/order_by.rs @@ -23,7 +23,6 @@ use crate::{expr::Sort, Cast, Expr, LogicalPlan, TryCast}; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::{Column, Result}; -use sqlparser::tokenizer::Span; /// Rewrite sort on aggregate expressions to sort on the column of aggregate output /// For example, `max(x)` is written to `col("max(x)")` diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 6bb909a93111..0a28bc4707c0 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -33,6 +33,7 @@ use datafusion_common::{ }; use datafusion_functions_window_common::field::WindowUDFFieldArgs; use recursive::recursive; +use sqlparser::tokenizer::Span; use std::collections::HashMap; use std::sync::Arc; @@ -405,16 +406,16 @@ impl ExprSchemable for Expr { }) => { let (left_type, left_is_nullable) = left.data_type_and_nullable(schema)?; - let left_type = if let Some(spans) = left.get_spans() { - WithSpans::new(&left_type, spans.iter().copied()) + let left_type = if left.get_span() != Span::empty() { + WithSpans::new(&left_type, [left.get_span()]) } else { (&left_type).into() }; let (right_type, right_is_nullable) = right.data_type_and_nullable(schema)?; - let right_type = if let Some(spans) = right.get_spans() { - WithSpans::new(&right_type, spans.iter().copied()) + let right_type = if right.get_span() != Span::empty() { + WithSpans::new(&right_type, [right.get_span()]) } else { (&right_type).into() }; diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 445cf3dea3a6..75a0b3433089 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1543,14 +1543,20 @@ pub fn union(left_plan: LogicalPlan, right_plan: LogicalPlan) -> Result Result> { - let metadata = input.schema().metadata().clone(); - let fields_spans = exprs - .iter() - .map(|e| e.get_spans().cloned().unwrap_or_else(|| vec![])) - .collect(); + let fields_spans = exprs.iter().map(|e| vec![e.get_span()]).collect(); let schema = DFSchema::new_with_metadata(exprlist_to_fields(exprs, input)?, metadata)? diff --git a/datafusion/expr/src/tree_node.rs b/datafusion/expr/src/tree_node.rs index eacace5ed046..a017953fd72d 100644 --- a/datafusion/expr/src/tree_node.rs +++ b/datafusion/expr/src/tree_node.rs @@ -127,7 +127,16 @@ impl TreeNode for Expr { expr, relation, name, - }) => f(*expr)?.update_data(|e| e.alias_qualified(relation, name)), + span, + .. + }) => f(*expr)?.update_data(|e| { + let e = e.alias_qualified(relation, name); + if let Expr::Alias(alias) = e { + Expr::Alias(alias.with_span(span)) + } else { + unreachable!(); + } + }), Expr::InSubquery(InSubquery { expr, subquery, diff --git a/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs b/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs index 225b6098a9eb..bd883b2c2033 100644 --- a/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs +++ b/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs @@ -124,28 +124,23 @@ fn expand_exprlist(input: &LogicalPlan, expr: Vec) -> Result> { ref relation, ref name, ref spans, + .. }) => { if name.eq("*") { let expanded_columns = if let Some(qualifier) = relation { - expand_qualified_wildcard( - qualifier, - input.schema(), - None, - )? + expand_qualified_wildcard(qualifier, input.schema(), None)? } else { - expand_wildcard( - input.schema(), - input, - None, - )? + expand_wildcard(input.schema(), input, None)? }; - projected_expr.extend(expanded_columns.into_iter().map(|mut expr| { - if let Expr::Column(c) = &mut expr { - c.spans = spans.clone(); - } - expr - })); + projected_expr.extend(expanded_columns.into_iter().map( + |mut expr| { + if let Expr::Column(c) = &mut expr { + c.spans = spans.clone(); + } + expr + }, + )); } else { projected_expr.push(e.clone()); } diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index b585beb1e7d5..904face71bec 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -1001,6 +1001,7 @@ fn project_with_column_index( relation: _, ref name, spans: _, + .. }) if name != schema.field(i).name() => e.alias(schema.field(i).name()), Expr::Alias { .. } | Expr::Column { .. } => e, _ => e.alias(schema.field(i).name()), diff --git a/datafusion/optimizer/src/decorrelate.rs b/datafusion/optimizer/src/decorrelate.rs index 5c519dce8fbb..050084964dba 100644 --- a/datafusion/optimizer/src/decorrelate.rs +++ b/datafusion/optimizer/src/decorrelate.rs @@ -532,7 +532,12 @@ fn proj_exprs_evaluation_result_on_empty_batch( let result_expr = simplifier.simplify(result_expr)?; let expr_name = match expr { Expr::Alias(Alias { name, .. }) => name.to_string(), - Expr::Column(Column { relation: _, name, spans: _ }) => name.to_string(), + Expr::Column(Column { + relation: _, + name, + spans: _, + .. + }) => name.to_string(), _ => expr.schema_name().to_string(), }; expr_result_map_for_count_bug.insert(expr_name, result_expr); diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index 1519c54dbf68..81e60ffd9da5 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -493,6 +493,7 @@ fn merge_consecutive_projections(proj: Projection) -> Result rewrite_expr(*expr, &prev_projection).map(|result| { result.update_data(|expr| Expr::Alias(Alias::new(expr, relation, name))) }), diff --git a/datafusion/optimizer/src/replace_distinct_aggregate.rs b/datafusion/optimizer/src/replace_distinct_aggregate.rs index 4787ba8d4ca7..992a926cd91f 100644 --- a/datafusion/optimizer/src/replace_distinct_aggregate.rs +++ b/datafusion/optimizer/src/replace_distinct_aggregate.rs @@ -157,10 +157,15 @@ impl OptimizerRule for ReplaceDistinctWithAggregate { .iter() .skip(expr_cnt) .zip(schema.iter()) - .map(|((new_qualifier, new_field, _), (old_qualifier, old_field, _))| { - col(Column::from((new_qualifier, new_field))) - .alias_qualified(old_qualifier.cloned(), old_field.name()) - }) + .map( + |( + (new_qualifier, new_field, _), + (old_qualifier, old_field, _), + )| { + col(Column::from((new_qualifier, new_field))) + .alias_qualified(old_qualifier.cloned(), old_field.name()) + }, + ) .collect::>(); let plan = LogicalPlanBuilder::from(plan) diff --git a/datafusion/proto-common/src/to_proto/mod.rs b/datafusion/proto-common/src/to_proto/mod.rs index 1b9583516ced..adad67031487 100644 --- a/datafusion/proto-common/src/to_proto/mod.rs +++ b/datafusion/proto-common/src/to_proto/mod.rs @@ -266,7 +266,7 @@ impl TryFrom<&DFSchema> for protobuf::DfSchema { fn try_from(s: &DFSchema) -> Result { let columns = s .iter() - .map(|(qualifier, field)| { + .map(|(qualifier, field, _)| { Ok(protobuf::DfField { field: Some(field.as_ref().try_into()?), qualifier: qualifier.map(|r| protobuf::ColumnRelation { diff --git a/datafusion/sql/src/expr/identifier.rs b/datafusion/sql/src/expr/identifier.rs index 735221eae6d5..7ee5bd45a058 100644 --- a/datafusion/sql/src/expr/identifier.rs +++ b/datafusion/sql/src/expr/identifier.rs @@ -20,8 +20,7 @@ use sqlparser::ast::{Expr as SQLExpr, Ident}; use datafusion_common::{ internal_err, not_impl_err, plan_datafusion_err, plan_err, Column, DFSchema, - DataFusionError, Diagnostic, DiagnosticEntry, DiagnosticEntryKind, Result, - TableReference, + DataFusionError, Result, TableReference, }; use datafusion_expr::planner::PlannerResult; use datafusion_expr::{Case, Expr}; @@ -114,7 +113,7 @@ impl SqlToRel<'_, S> { })?; Ok(Expr::ScalarVariable(ty, var_names)) } else { - let span = ids.last().map(|id| id.span).unwrap_or(Span::empty()); + let span = Span::union_iter(ids.iter().map(|id| id.span)); let ids = ids .into_iter() .map(|id| self.ident_normalizer.normalize(id)) diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index c08a9fd44e04..aa4dbb37e117 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -157,7 +157,9 @@ impl SqlToRel<'_, S> { } _ => false, }) { - Some((qualifier, df_field, _)) => Expr::from((qualifier, df_field)), + Some((qualifier, df_field, _)) => { + Expr::from((qualifier, df_field)) + } None => Expr::Column(col), } } diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 33b3b97044af..ad30fc3bbb5d 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -394,7 +394,10 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { col.name, r ), DiagnosticEntryKind::Error, - col.spans.first().copied().unwrap_or(Span::empty()), + col.spans + .first() + .copied() + .unwrap_or(Span::empty()), )]) }) }, diff --git a/datafusion/sql/src/relation/join.rs b/datafusion/sql/src/relation/join.rs index 2ed1197e8fbf..faec947ce5d8 100644 --- a/datafusion/sql/src/relation/join.rs +++ b/datafusion/sql/src/relation/join.rs @@ -16,8 +16,8 @@ // under the License. use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; -use datafusion_common::{not_impl_err, Column, Result}; -use datafusion_expr::{JoinType, LogicalPlan, LogicalPlanBuilder}; +use datafusion_common::{not_impl_err, Column, DataFusionError, Result, ScalarValue}; +use datafusion_expr::{Expr, JoinType, LogicalPlan, LogicalPlanBuilder}; use sqlparser::ast::{Join, JoinConstraint, JoinOperator, TableFactor, TableWithJoins}; use std::collections::HashSet; @@ -26,19 +26,21 @@ impl SqlToRel<'_, S> { &self, t: TableWithJoins, planner_context: &mut PlannerContext, - ) -> Result { + ) -> Result<(LogicalPlan, Option)> { let mut left = if is_lateral(&t.relation) { self.create_relation_subquery(t.relation, planner_context)? } else { self.create_relation(t.relation, planner_context)? }; + let mut ignorable_error = None; let old_outer_from_schema = planner_context.outer_from_schema(); for join in t.joins { planner_context.extend_outer_from_schema(left.schema())?; - left = self.parse_relation_join(left, join, planner_context)?; + (left, ignorable_error) = + self.parse_relation_join(left, join, planner_context)?; } planner_context.set_outer_from_schema(old_outer_from_schema); - Ok(left) + Ok((left, ignorable_error)) } fn parse_relation_join( @@ -46,7 +48,7 @@ impl SqlToRel<'_, S> { left: LogicalPlan, join: Join, planner_context: &mut PlannerContext, - ) -> Result { + ) -> Result<(LogicalPlan, Option)> { let right = if is_lateral_join(&join)? { self.create_relation_subquery(join.relation, planner_context)? } else { @@ -93,7 +95,10 @@ impl SqlToRel<'_, S> { JoinOperator::FullOuter(constraint) => { self.parse_join(left, right, constraint, JoinType::Full, planner_context) } - JoinOperator::CrossJoin => self.parse_cross_join(left, right), + JoinOperator::CrossJoin => { + let plan = self.parse_cross_join(left, right)?; + Ok((plan, None)) + } other => not_impl_err!("Unsupported JOIN operator {other:?}"), } } @@ -113,24 +118,32 @@ impl SqlToRel<'_, S> { constraint: JoinConstraint, join_type: JoinType, planner_context: &mut PlannerContext, - ) -> Result { + ) -> Result<(LogicalPlan, Option)> { match constraint { JoinConstraint::On(sql_expr) => { let join_schema = left.schema().join(right.schema())?; // parse ON expression - let expr = self.sql_to_expr(sql_expr, &join_schema, planner_context)?; - LogicalPlanBuilder::from(left) + let (expr, ignorable_error) = + match self.sql_to_expr(sql_expr, &join_schema, planner_context) { + Ok(expr) => (expr, None), + Err(e) => { + (Expr::Literal(ScalarValue::Boolean(Some(false))), Some(e)) + } + }; + let plan = LogicalPlanBuilder::from(left) .join_on(right, join_type, Some(expr))? - .build() + .build()?; + Ok((plan, ignorable_error)) } JoinConstraint::Using(idents) => { let keys: Vec = idents .into_iter() .map(|x| Column::from_name(self.ident_normalizer.normalize(x))) .collect(); - LogicalPlanBuilder::from(left) + let plan = LogicalPlanBuilder::from(left) .join_using(right, join_type, keys)? - .build() + .build()?; + Ok((plan, None)) } JoinConstraint::Natural => { let left_cols: HashSet<&String> = @@ -143,17 +156,21 @@ impl SqlToRel<'_, S> { .filter(|f| left_cols.contains(f)) .map(Column::from_name) .collect(); - if keys.is_empty() { - self.parse_cross_join(left, right) + let plan = if keys.is_empty() { + self.parse_cross_join(left, right)? } else { LogicalPlanBuilder::from(left) .join_using(right, join_type, keys)? - .build() - } + .build()? + }; + Ok((plan, None)) + } + JoinConstraint::None => { + let plan = LogicalPlanBuilder::from(left) + .join_on(right, join_type, [])? + .build()?; + Ok((plan, None)) } - JoinConstraint::None => LogicalPlanBuilder::from(left) - .join_on(right, join_type, [])? - .build(), } } } diff --git a/datafusion/sql/src/relation/mod.rs b/datafusion/sql/src/relation/mod.rs index 45a617daae96..56402aa0353a 100644 --- a/datafusion/sql/src/relation/mod.rs +++ b/datafusion/sql/src/relation/mod.rs @@ -100,7 +100,14 @@ impl SqlToRel<'_, S> { table_with_joins, alias, } => ( - self.plan_table_with_joins(*table_with_joins, planner_context)?, + { + let (plan, ignorable_error) = + self.plan_table_with_joins(*table_with_joins, planner_context)?; + if let Some(e) = ignorable_error { + return Err(e); + } + plan + }, alias, ), TableFactor::UNNEST { diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index b9ffce0b2af6..af3ed62ed7a8 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -27,7 +27,8 @@ use crate::utils::{ use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; use datafusion_common::{ - not_impl_err, plan_err, Diagnostic, DiagnosticEntry, DiagnosticEntryKind, Result, + not_impl_err, plan_err, DataFusionError, Diagnostic, DiagnosticEntry, + DiagnosticEntryKind, Result, }; use datafusion_common::{RecursionUnnestOption, UnnestOptions}; use datafusion_expr::expr::{Alias, PlannedReplaceSelectItem, WildcardOptions}; @@ -75,8 +76,17 @@ impl SqlToRel<'_, S> { return not_impl_err!("SORT BY"); } + let mut errs = vec![]; + // Process `from` clause - let plan = self.plan_from_tables(select.from, planner_context)?; + let plan = { + let (plan, ignorable_error) = + self.plan_from_tables(select.from, planner_context)?; + if let Some(err) = ignorable_error { + errs.push(err); + } + plan + }; let empty_from = matches!(plan, LogicalPlan::EmptyRelation(_)); // Process `where` clause @@ -198,13 +208,21 @@ impl SqlToRel<'_, S> { .is_empty() || !aggr_exprs.is_empty() { - self.aggregate( + match self.aggregate( &base_plan, &select_exprs, having_expr_opt.as_ref(), &group_by_exprs, &aggr_exprs, - )? + ) { + Err(err) => { + errs.push(err); + (base_plan.clone(), select_exprs.clone(), having_expr_opt) + } + Ok((plan, select_exprs_post_aggr, having_expr_post_aggr)) => { + (plan, select_exprs_post_aggr, having_expr_post_aggr) + } + } } else { match having_expr_opt { Some(having_expr) => return plan_err!("HAVING clause references: {having_expr} must appear in the GROUP BY clause or be used in an aggregate function"), @@ -288,7 +306,13 @@ impl SqlToRel<'_, S> { plan }; - self.order_by(plan, order_by_rex) + let plan = self.order_by(plan, order_by_rex)?; + + if !errs.is_empty() { + Err(DataFusionError::Collection(errs)) + } else { + Ok(plan) + } } /// Try converting Expr(Unnest(Expr)) to Projection/Unnest/Projection @@ -537,9 +561,12 @@ impl SqlToRel<'_, S> { &self, mut from: Vec, planner_context: &mut PlannerContext, - ) -> Result { + ) -> Result<(LogicalPlan, Option)> { match from.len() { - 0 => Ok(LogicalPlanBuilder::empty(true).build()?), + 0 => { + let plan = LogicalPlanBuilder::empty(true).build()?; + Ok((plan, None)) + } 1 => { let input = from.remove(0); self.plan_table_with_joins(input, planner_context) @@ -547,24 +574,49 @@ impl SqlToRel<'_, S> { _ => { let mut from = from.into_iter(); - let mut left = LogicalPlanBuilder::from({ - let input = from.next().unwrap(); - self.plan_table_with_joins(input, planner_context)? - }); + let mut ignorable_error = None; + let mut add_ignorable_error = |new_ignorable_err: Option< + DataFusionError, + >| match new_ignorable_err { + None => (), + Some(new_ignorable_err) => { + if ignorable_error.is_none() { + ignorable_error = Some(DataFusionError::Collection(vec![])); + } + if let Some(DataFusionError::Collection(ignorable_errors)) = + &mut ignorable_error + { + ignorable_errors.push(new_ignorable_err); + } + } + }; + + let left = { + let (left, ignorable_error) = self + .plan_table_with_joins(from.next().unwrap(), planner_context)?; + add_ignorable_error(ignorable_error); + left + }; + let mut left = LogicalPlanBuilder::from(left); let old_outer_from_schema = { let left_schema = Some(Arc::clone(left.schema())); planner_context.set_outer_from_schema(left_schema) }; for input in from { // Join `input` with the current result (`left`). - let right = self.plan_table_with_joins(input, planner_context)?; + let right = { + let (right, ignorable_error) = + self.plan_table_with_joins(input, planner_context)?; + add_ignorable_error(ignorable_error); + right + }; left = left.cross_join(right)?; // Update the outer FROM schema. let left_schema = Some(Arc::clone(left.schema())); planner_context.set_outer_from_schema(left_schema); } planner_context.set_outer_from_schema(old_outer_from_schema); - left.build() + Ok((left.build()?, ignorable_error)) } } } @@ -606,6 +658,7 @@ impl SqlToRel<'_, S> { Ok(vec![col]) } SelectItem::ExprWithAlias { expr, alias } => { + let alias_span = alias.span; let select_expr = self.sql_to_expr(expr, plan.schema(), planner_context)?; let col = normalize_col_with_schemas_and_ambiguity_check( @@ -617,7 +670,14 @@ impl SqlToRel<'_, S> { // avoiding adding an alias if the column name is the same. let expr = match &col { Expr::Column(column) if column.name.eq(&name) => col, - _ => col.alias(name), + _ => { + let expr = col.alias(name); + if let Expr::Alias(alias) = expr { + Expr::Alias(alias.with_span(alias_span)) + } else { + unreachable!(); + } + } }; Ok(vec![expr]) } @@ -815,11 +875,7 @@ impl SqlToRel<'_, S> { Diagnostic::new([DiagnosticEntry::new( "GROUP BY clause is here", DiagnosticEntryKind::Note, - Span::union_iter( - group_by_exprs - .iter() - .flat_map(|e| e.get_spans().cloned().unwrap_or_default()), - ), + Span::union_iter(group_by_exprs.iter().map(|e| e.get_span())), )]) }) })?; diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index 572ac8221b1c..330fdfc39e83 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -1714,7 +1714,14 @@ impl SqlToRel<'_, S> { // Build scan, join with from table if it exists. let mut input_tables = vec![table]; input_tables.extend(from); - let scan = self.plan_from_tables(input_tables, &mut planner_context)?; + let scan = { + let (scan, ignorable_error) = + self.plan_from_tables(input_tables, &mut planner_context)?; + if let Some(ignorable_error) = ignorable_error { + return Err(ignorable_error); + } + scan + }; // Filter let source = match predicate_expr { diff --git a/datafusion/sql/src/unparser/plan.rs b/datafusion/sql/src/unparser/plan.rs index 04b8a99eacc9..b4ada73143e4 100644 --- a/datafusion/sql/src/unparser/plan.rs +++ b/datafusion/sql/src/unparser/plan.rs @@ -924,11 +924,13 @@ impl Unparser<'_> { relation: _, name: left_name, spans: _, + .. }), Expr::Column(Column { relation: _, name: right_name, spans: _, + .. }), ) if left_name == right_name => { idents.push(self.new_ident_quoted_if_needs(left_name.to_string())); diff --git a/datafusion/sql/src/unparser/rewrite.rs b/datafusion/sql/src/unparser/rewrite.rs index 68af121a4117..4dc56beb5adc 100644 --- a/datafusion/sql/src/unparser/rewrite.rs +++ b/datafusion/sql/src/unparser/rewrite.rs @@ -311,6 +311,7 @@ pub(super) fn inject_column_aliases( expr: Box::new(expr.clone()), relation, name: col_alias.value, + span: col_alias.span, }) }) .collect::>(); diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index 432d04510c6a..fa4ba20c4676 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -123,20 +123,25 @@ pub(crate) fn check_columns_satisfy_exprs( Expr::Column(_) => Ok(()), _ => internal_err!("Expr::Column are required"), })?; + let mut errs = vec![]; let column_exprs = find_column_exprs(exprs); for e in &column_exprs { match e { Expr::GroupingSet(GroupingSet::Rollup(exprs)) => { for e in exprs { - if let Err(err) = check_column_satisfies_expr(columns, e, call_purpose) { + if let Err(err) = + check_column_satisfies_expr(columns, e, call_purpose) + { errs.push(err); } } } Expr::GroupingSet(GroupingSet::Cube(exprs)) => { for e in exprs { - if let Err(err) = check_column_satisfies_expr(columns, e, call_purpose) { + if let Err(err) = + check_column_satisfies_expr(columns, e, call_purpose) + { errs.push(err); } } @@ -144,15 +149,19 @@ pub(crate) fn check_columns_satisfy_exprs( Expr::GroupingSet(GroupingSet::GroupingSets(lists_of_exprs)) => { for exprs in lists_of_exprs { for e in exprs { - if let Err(err) = check_column_satisfies_expr(columns, e, call_purpose) { + if let Err(err) = + check_column_satisfies_expr(columns, e, call_purpose) + { errs.push(err); } } } } - _ => if let Err(err) = check_column_satisfies_expr(columns, e, call_purpose) { - errs.push(err); - }, + _ => { + if let Err(err) = check_column_satisfies_expr(columns, e, call_purpose) { + errs.push(err); + } + } } } if !errs.is_empty() { @@ -177,11 +186,11 @@ fn check_column_satisfies_expr( let message = match call_purpose { CheckColumnsSatisfyExprsPurpose::GroupBy => format!( "'{}' in projection does not appear in GROUP BY clause", - expr.to_string() + expr ), CheckColumnsSatisfyExprsPurpose::Having => format!( "'{}' in HAVING clause does not appear in GROUP BY clause", - expr.to_string() + expr ), }; err.with_diagnostic(|_| { @@ -189,12 +198,10 @@ fn check_column_satisfies_expr( DiagnosticEntry::new( message, DiagnosticEntryKind::Error, - expr.get_spans() - .and_then(|spans| spans.first().copied()) - .unwrap_or(Span::empty()), + expr.get_span(), ), DiagnosticEntry::new( - format!("Add '{}' to GROUP BY clause", expr.to_string()), + format!("Add '{}' to GROUP BY clause", expr), DiagnosticEntryKind::Help, Span::empty(), ),