apache · eliaperantoni · Nov 24, 2024 · Nov 24, 2024 · Dec 5, 2024 · Dec 6, 2024
diff --git a/Cargo.toml b/Cargo.toml
@@ -151,6 +151,7 @@ sqlparser = { version = "0.52.0", features = ["visitor"] }
 tempfile = "3"
 tokio = { version = "1.36", features = ["macros", "rt", "sync"] }
 url = "2.2"
+derivative = "2.2.0"
 
 [profile.release]
 codegen-units = 1
@@ -177,3 +178,7 @@ large_futures = "warn"
 [workspace.lints.rust]
 unexpected_cfgs = { level = "warn", check-cfg = ["cfg(tarpaulin)"] }
 unused_qualifications = "deny"
+
+## Temporary patch: build sqlparser from a pinned git revision
+[patch.crates-io]
+sqlparser = { git = "https://github.com/apache/datafusion-sqlparser-rs.git", rev = "4ab3ab91473d152c652e6582b63abb13535703f9" }
diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml
@@ -65,6 +65,7 @@ pyo3 = { version = "0.22.0", optional = true }
 recursive = { workspace = true }
 sqlparser = { workspace = true }
 tokio = { workspace = true }
+derivative = { workspace = true }
 
 [target.'cfg(target_family = "wasm")'.dependencies]
 web-time = "1.1.0"

diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs
@@ -18,22 +18,38 @@
 //! Column
 
 use arrow_schema::{Field, FieldRef};
+use sqlparser::tokenizer::Span;
 
 use crate::error::_schema_err;
 use crate::utils::{parse_identifiers_normalized, quote_identifier};
-use crate::{DFSchema, Result, SchemaError, TableReference};
+use crate::{
+    DFSchema, Diagnostic, DiagnosticEntry, DiagnosticEntryKind, Result, SchemaError,
+    TableReference,
+};
+use derivative::Derivative;
 use std::collections::HashSet;
 use std::convert::Infallible;
 use std::fmt;
 use std::str::FromStr;
 
 /// A named reference to a qualified field in a schema.
-#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
+#[derive(Debug, Derivative)]
+#[derivative(PartialEq, Eq, Hash, PartialOrd, Ord, Clone)]
 pub struct Column {
     /// relation/table reference.
     pub relation: Option<TableReference>,
     /// field/column name.
     pub name: String,
+    #[derivative(
+        PartialEq = "ignore",
+        Hash = "ignore",
+        PartialOrd = "ignore",
+        Ord = "ignore"
+    )]
+    /// Locations in the source SQL query that this column came from. Kept
+    /// as-is while the [`Column`] is normalized or otherwise transformed, and
+    /// ignored when comparing, ordering, or hashing columns.
+    pub spans: Vec<Span>,
 }
 
 impl Column {
@@ -50,6 +66,7 @@ impl Column {
         Self {
             relation: relation.map(|r| r.into()),
             name: name.into(),
+            spans: vec![],
         }
     }
 
@@ -58,6 +75,7 @@ impl Column {
         Self {
             relation: None,
             name: name.into(),
+            spans: vec![],
         }
     }
 
@@ -68,6 +86,7 @@ impl Column {
         Self {
             relation: None,
             name: name.into(),
+            spans: vec![],
         }
     }
 
@@ -99,7 +118,11 @@ impl Column {
             // identifiers will be treated as an unqualified column name
             _ => return None,
         };
-        Some(Self { relation, name })
+        Some(Self {
+            relation,
+            name,
+            spans: vec![],
+        })
     }
 
     /// Deserialize a fully qualified name string into a column
@@ -113,6 +136,7 @@ impl Column {
             Self {
                 relation: None,
                 name: flat_name,
+                spans: vec![],
             },
         )
     }
@@ -124,6 +148,7 @@ impl Column {
             Self {
                 relation: None,
                 name: flat_name,
+                spans: vec![],
             },
         )
     }
@@ -239,7 +264,33 @@ impl Column {
 
                     // If not due to USING columns then due to ambiguous column name
                     return _schema_err!(SchemaError::AmbiguousReference {
-                        field: Column::new_unqualified(self.name),
+                        field: Column::new_unqualified(&self.name),
+                    })
+                    .map_err(|err| {
+                        err.with_diagnostic(|_| {
+                            Diagnostic::new(
+                                [DiagnosticEntry::new(
+                                    format!("Ambiguous reference to '{}'", &self.name),
+                                    DiagnosticEntryKind::Error,
+                                    *self.spans.first().unwrap_or(&Span::empty()),
+                                )]
+                                .into_iter()
+                                .chain(
+                                    columns.iter().flat_map(|c| {
+                                        c.spans.iter().map(|span| {
+                                            DiagnosticEntry::new(
+                                                format!(
+                                                    "Possible reference to '{}' here",
+                                                    &c.name
+                                                ),
+                                                DiagnosticEntryKind::Note,
+                                                *span,
+                                            )
+                                        })
+                                    }),
+                                ),
+                            )
+                        })
                     });
                 }
             }
@@ -254,6 +305,23 @@ impl Column {
                 .collect(),
         })
     }
+
+    /// Returns this [`Column`] with `span` as its only span, i.e. its location
+    /// in the source SQL query.
+    pub fn with_span(self, span: Span) -> Self {
+        let spans = vec![span];
+        Self { spans, ..self }
+    }
+
+    /// Returns this [`Column`] with its spans replaced by those yielded by
+    /// `spans`, i.e. its locations in the source SQL query.
+    pub fn with_spans<IT>(self, spans: IT) -> Self
+    where
+        IT: IntoIterator<Item = Span>,
+    {
+        let spans: Vec<Span> = spans.into_iter().collect();
+        Self { spans, ..self }
+    }
 }
 
 impl From<&str> for Column {

diff --git a/datafusion/common/src/dfschema/fields_spans.rs b/datafusion/common/src/dfschema/fields_spans.rs
@@ -0,0 +1,90 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! [`FieldsSpans`]: source-code locations recorded for each field.
+
+use sqlparser::tokenizer::Span;
+
+use crate::JoinType;
+
+/// The location in the source code where the fields are defined (e.g. in the
+/// body of a CTE), if any.
+///
+/// Holds one list of [`Span`]s per field, in field order; a field with no
+/// known location has an empty list.
+#[derive(Debug, Clone)]
+pub struct FieldsSpans(Vec<Vec<Span>>);
+
+impl FieldsSpans {
+    /// Creates a [`FieldsSpans`] for `field_count` fields, none of which has a
+    /// known location.
+    pub fn empty(field_count: usize) -> Self {
+        Self(vec![Vec::new(); field_count])
+    }
+
+    /// Iterates over the spans of each field, in field order.
+    pub fn iter(&self) -> impl Iterator<Item = &Vec<Span>> {
+        self.0.iter()
+    }
+
+    /// Computes the spans of the fields produced by joining `self` (the left
+    /// side) with `other` (the right side).
+    ///
+    /// The result is laid out to line up with the fields each join type
+    /// outputs:
+    ///
+    /// - `Inner`, `Left`, `Right`, `Full`: left fields followed by right fields.
+    /// - `LeftSemi`, `LeftAnti`: left fields only.
+    /// - `RightSemi`, `RightAnti`: right fields only.
+    /// - `LeftMark`: left fields followed by one extra entry, with no spans,
+    ///   for the mark column.
+    ///
+    /// NOTE(review): the semi/anti/mark layouts assume the join output schema
+    /// is built the same way — confirm against the join schema construction.
+    ///
+    /// `_left_cols_len` is currently unused.
+    pub fn join(
+        &self,
+        other: &FieldsSpans,
+        join_type: &JoinType,
+        _left_cols_len: usize,
+    ) -> FieldsSpans {
+        match join_type {
+            JoinType::Inner | JoinType::Left | JoinType::Right | JoinType::Full => {
+                FieldsSpans(self.0.iter().chain(other.0.iter()).cloned().collect())
+            }
+            // Only the left side's fields survive these joins.
+            JoinType::LeftSemi | JoinType::LeftAnti => self.clone(),
+            // Only the right side's fields survive these joins.
+            JoinType::RightSemi | JoinType::RightAnti => other.clone(),
+            JoinType::LeftMark => {
+                // The mark column is synthesized by the join, so it has no
+                // location in the source query.
+                let mut spans = self.0.clone();
+                spans.push(Vec::new());
+                FieldsSpans(spans)
+            }
+        }
+    }
+}
+
+impl FromIterator<Vec<Span>> for FieldsSpans {
+    /// Builds a [`FieldsSpans`] from per-field span lists, preserving order.
+    fn from_iter<T: IntoIterator<Item = Vec<Span>>>(iter: T) -> Self {
+        FieldsSpans(iter.into_iter().collect())
+    }
+}