
Commit

Merge branch 'main' into 13525/invariant-checking-for-implicit-LP-changes
wiedld committed Dec 25, 2024
2 parents 9842d19 + e99e02b commit 00700ae
Showing 162 changed files with 925 additions and 541 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -73,7 +73,7 @@ version = "43.0.0"
# selectively turn them on if needed, since we can override default-features = true (from false)
# for the inherited dependency but cannot do the reverse (override from true to false).
#
-# See for more detaiils: https://github.com/rust-lang/cargo/issues/11329
+# See for more details: https://github.com/rust-lang/cargo/issues/11329
ahash = { version = "0.8", default-features = false, features = [
"runtime-rng",
] }
2 changes: 1 addition & 1 deletion README.md
@@ -113,7 +113,7 @@ Default features:
- `regex_expressions`: regular expression functions, such as `regexp_match`
- `unicode_expressions`: Include unicode aware functions such as `character_length`
- `unparser`: enables support to reverse LogicalPlans back into SQL
-- `recursive-protection`: uses [recursive](https://docs.rs/recursive/latest/recursive/) for stack overflow protection.
+- `recursive_protection`: uses [recursive](https://docs.rs/recursive/latest/recursive/) for stack overflow protection.

Optional features:

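The renamed `recursive_protection` feature gates DataFusion's use of the [recursive](https://docs.rs/recursive/latest/recursive/) crate. As a hedged illustration (the function below is invented, not from this commit), the crate's `#[recursive]` attribute grows the stack on demand so deep recursion does not overflow:

```rust
// Illustrative sketch only: #[recursive] checks remaining stack space at
// each call and spills to a new heap-allocated stack segment when needed.
use recursive::recursive;

#[recursive]
fn depth(n: u64) -> u64 {
    // hypothetical deep recursion, e.g. a deeply nested SQL expression tree
    if n == 0 {
        0
    } else {
        1 + depth(n - 1)
    }
}

fn main() {
    // without the attribute this would overflow a default-sized thread stack
    println!("{}", depth(10_000_000));
}
```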
1 change: 1 addition & 0 deletions datafusion-cli/Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions datafusion-cli/Cargo.toml
@@ -45,6 +45,7 @@ datafusion = { path = "../datafusion/core", version = "43.0.0", features = [
"datetime_expressions",
"encoding_expressions",
"parquet",
+"recursive_protection",
"regex_expressions",
"unicode_expressions",
"compression",
2 changes: 1 addition & 1 deletion datafusion-cli/src/functions.rs
@@ -360,7 +360,7 @@ impl TableFunctionImpl for ParquetMetadataFunc {
Field::new("total_uncompressed_size", DataType::Int64, true),
]));

-// construct recordbatch from metadata
+// construct record batch from metadata
let mut filename_arr = vec![];
let mut row_group_id_arr = vec![];
let mut row_group_num_rows_arr = vec![];
2 changes: 1 addition & 1 deletion datafusion-examples/README.md
@@ -22,7 +22,7 @@
This crate includes end to end, highly commented examples of how to use
various DataFusion APIs to help you get started.

-## Prerequisites:
+## Prerequisites

Run `git submodule update --init` to init test files.

4 changes: 2 additions & 2 deletions datafusion-examples/examples/advanced_parquet_index.rs
@@ -82,7 +82,7 @@ use url::Url;
/// Specifically, this example illustrates how to:
/// 1. Use [`ParquetFileReaderFactory`] to avoid re-reading parquet metadata on each query
/// 2. Use [`PruningPredicate`] for predicate analysis
-/// 3. Pass a row group selection to [`ParuetExec`]
+/// 3. Pass a row group selection to [`ParquetExec`]
/// 4. Pass a row selection (within a row group) to [`ParquetExec`]
///
/// Note this is a *VERY* low level example for people who want to build their
@@ -211,7 +211,7 @@ async fn main() -> Result<()> {
//
// Note: in order to prune pages, the Page Index must be loaded and the
// ParquetExec will load it on demand if not present. To avoid a second IO
-// during query, this example loaded the Page Index pre-emptively by setting
+// during query, this example loaded the Page Index preemptively by setting
// `ArrowReader::with_page_index` in `IndexedFile::try_new`
provider.set_use_row_selection(true);
println!("** Select data, predicate `id = 950`");
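As context for the Page Index comment above, here is a minimal, hedged sketch of loading the Page Index eagerly via the parquet crate's reader options (a standalone illustration; the commit's example routes this through its own `IndexedFile::try_new` instead):

```rust
use std::fs::File;

use parquet::arrow::arrow_reader::{ArrowReaderOptions, ParquetRecordBatchReaderBuilder};

fn read_with_page_index(path: &str) -> Result<(), Box<dyn std::error::Error>> {
    let file = File::open(path)?;
    // Load the Page Index together with the footer metadata, so later
    // page-level pruning needs no second read of the file.
    let options = ArrowReaderOptions::new().with_page_index(true);
    let builder = ParquetRecordBatchReaderBuilder::try_new_with_options(file, options)?;
    for batch in builder.build()? {
        println!("read {} rows", batch?.num_rows());
    }
    Ok(())
}
```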
94 changes: 89 additions & 5 deletions datafusion-examples/examples/advanced_udwf.rs
@@ -24,11 +24,14 @@ use arrow::{
};
use arrow_schema::Field;
use datafusion::error::Result;
+use datafusion::functions_aggregate::average::avg_udaf;
use datafusion::prelude::*;
use datafusion_common::ScalarValue;
-use datafusion_expr::function::WindowUDFFieldArgs;
+use datafusion_expr::expr::WindowFunction;
+use datafusion_expr::function::{WindowFunctionSimplification, WindowUDFFieldArgs};
+use datafusion_expr::simplify::SimplifyInfo;
use datafusion_expr::{
-PartitionEvaluator, Signature, WindowFrame, WindowUDF, WindowUDFImpl,
+Expr, PartitionEvaluator, Signature, WindowFrame, WindowUDF, WindowUDFImpl,
};
use datafusion_functions_window_common::partition::PartitionEvaluatorArgs;

@@ -142,6 +145,67 @@ impl PartitionEvaluator for MyPartitionEvaluator {
}
}

+/// This UDWF shows how to use the WindowUDFImpl::simplify() API
+#[derive(Debug, Clone)]
+struct SimplifySmoothItUdf {
+    signature: Signature,
+}
+
+impl SimplifySmoothItUdf {
+    fn new() -> Self {
+        Self {
+            signature: Signature::exact(
+                // this function will always take one argument of type f64
+                vec![DataType::Float64],
+                // this function is deterministic and will always return the same
+                // result for the same input
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+impl WindowUDFImpl for SimplifySmoothItUdf {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "simplify_smooth_it"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn partition_evaluator(
+        &self,
+        _partition_evaluator_args: PartitionEvaluatorArgs,
+    ) -> Result<Box<dyn PartitionEvaluator>> {
+        todo!()
+    }
+
+    /// This function rewrites `SimplifySmoothItUdf` to an `AggregateUDF` for `Avg`,
+    /// so the default `partition_evaluator` is never called (hence the `todo!()` above)
+    fn simplify(&self) -> Option<WindowFunctionSimplification> {
+        let simplify = |window_function: WindowFunction, _: &dyn SimplifyInfo| {
+            Ok(Expr::WindowFunction(WindowFunction {
+                fun: datafusion_expr::WindowFunctionDefinition::AggregateUDF(avg_udaf()),
+                args: window_function.args,
+                partition_by: window_function.partition_by,
+                order_by: window_function.order_by,
+                window_frame: window_function.window_frame,
+                null_treatment: window_function.null_treatment,
+            }))
+        };
+
+        Some(Box::new(simplify))
+    }
+
+    fn field(&self, field_args: WindowUDFFieldArgs) -> Result<Field> {
+        Ok(Field::new(field_args.name(), DataType::Float64, true))
+    }
+}

// create local execution context with `cars.csv` registered as a table named `cars`
async fn create_context() -> Result<SessionContext> {
// declare a new context. In spark API, this corresponds to a new spark SQL session
@@ -162,12 +226,15 @@ async fn main() -> Result<()> {
let smooth_it = WindowUDF::from(SmoothItUdf::new());
ctx.register_udwf(smooth_it.clone());

-// Use SQL to run the new window function
+let simplify_smooth_it = WindowUDF::from(SimplifySmoothItUdf::new());
+ctx.register_udwf(simplify_smooth_it.clone());
+
+// Use SQL to retrieve entire table
let df = ctx.sql("SELECT * from cars").await?;
// print the results
df.show().await?;

-// Use SQL to run the new window function:
+// Use SQL to run smooth_it:
//
// `PARTITION BY car`: each distinct value of car (red and green)
// should be treated as a separate partition (and will result in
Expand Down Expand Up @@ -201,7 +268,7 @@ async fn main() -> Result<()> {
// print the results
df.show().await?;

-// this time, call the new widow function with an explicit
+// this time, call the function with an explicit
// window so evaluate will be invoked with each window.
//
// `ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING`: each invocation
Expand Down Expand Up @@ -232,5 +299,22 @@ async fn main() -> Result<()> {
// print the results
df.show().await?;

+    // Use SQL to run simplify_smooth_it
+    let df = ctx
+        .sql(
+            "SELECT \
+                car, \
+                speed, \
+                simplify_smooth_it(speed) OVER (PARTITION BY car ORDER BY time) AS smooth_speed,\
+                time \
+                from cars \
+            ORDER BY \
+                car",
+        )
+        .await?;
+
+    // print the results
+    df.show().await?;

Ok(())
}
4 changes: 2 additions & 2 deletions datafusion-examples/examples/analyzer_rule.rs
@@ -138,7 +138,7 @@ impl AnalyzerRule for RowLevelAccessControl {
fn analyze(&self, plan: LogicalPlan, _config: &ConfigOptions) -> Result<LogicalPlan> {
// use the TreeNode API to recursively walk the LogicalPlan tree
// and all of its children (inputs)
-let transfomed_plan = plan.transform(|plan| {
+let transformed_plan = plan.transform(|plan| {
// This closure is called for each LogicalPlan node
// if it is a Scan node, add a filter to remove all managers
if is_employee_table_scan(&plan) {
@@ -166,7 +166,7 @@ impl AnalyzerRule for RowLevelAccessControl {
//
// This example does not need the value of either flag, so simply
// extract the LogicalPlan "data"
-Ok(transfomed_plan.data)
+Ok(transformed_plan.data)
}

fn name(&self) -> &str {
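Since both hunks in this rule lean on the `Transformed` wrapper returned by `plan.transform`, a minimal sketch of that pattern may help (assuming DataFusion's `datafusion_common::tree_node` API as used above; the filter-matching logic is invented for illustration):

```rust
use datafusion_common::tree_node::{Transformed, TreeNode};
use datafusion_common::Result;
use datafusion_expr::LogicalPlan;

/// Walk `plan` bottom-up and report whether any node was rewritten.
fn touch_filters(plan: LogicalPlan) -> Result<(LogicalPlan, bool)> {
    // the closure runs once per node and wraps its result in
    // Transformed::yes(new_node) or Transformed::no(unchanged_node)
    let transformed_plan = plan.transform(|node| {
        if matches!(node, LogicalPlan::Filter(_)) {
            // a real rule would build a replacement node here
            Ok(Transformed::yes(node))
        } else {
            Ok(Transformed::no(node))
        }
    })?;
    // `.data` is the (possibly rewritten) plan; `.transformed` is the flag
    // the example above deliberately ignores
    Ok((transformed_plan.data, transformed_plan.transformed))
}
```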
10 changes: 5 additions & 5 deletions datafusion-examples/examples/catalog.rs
@@ -46,11 +46,11 @@ async fn main() -> Result<()> {

let ctx = SessionContext::new();
let state = ctx.state();
-let cataloglist = Arc::new(CustomCatalogProviderList::new());
+let catalog_list = Arc::new(CustomCatalogProviderList::new());

// use our custom catalog list for context. each context has a single catalog list.
// context will by default have [`MemoryCatalogProviderList`]
-ctx.register_catalog_list(cataloglist.clone());
+ctx.register_catalog_list(catalog_list.clone());

// initialize our catalog and schemas
let catalog = DirCatalog::new();
@@ -81,7 +81,7 @@ async fn main() -> Result<()> {
ctx.register_catalog("dircat", Arc::new(catalog));
{
// catalog was passed down into our custom catalog list since we override the ctx's default
-let catalogs = cataloglist.catalogs.read().unwrap();
+let catalogs = catalog_list.catalogs.read().unwrap();
assert!(catalogs.contains_key("dircat"));
};

@@ -144,8 +144,8 @@ impl DirSchema {
async fn create(state: &SessionState, opts: DirSchemaOpts<'_>) -> Result<Arc<Self>> {
let DirSchemaOpts { ext, dir, format } = opts;
let mut tables = HashMap::new();
-let direntries = std::fs::read_dir(dir).unwrap();
-for res in direntries {
+let dir_entries = std::fs::read_dir(dir).unwrap();
+for res in dir_entries {
let entry = res.unwrap();
let filename = entry.file_name().to_str().unwrap().to_string();
if !filename.ends_with(ext) {
4 changes: 2 additions & 2 deletions datafusion-examples/examples/expr_api.rs
@@ -53,7 +53,7 @@ use datafusion_optimizer::analyzer::type_coercion::TypeCoercionRewriter;
/// 4. Simplify expressions: [`simplify_demo`]
/// 5. Analyze predicates for boundary ranges: [`range_analysis_demo`]
/// 6. Get the types of the expressions: [`expression_type_demo`]
-/// 7. Apply type cocercion to expressions: [`type_coercion_demo`]
+/// 7. Apply type coercion to expressions: [`type_coercion_demo`]
#[tokio::main]
async fn main() -> Result<()> {
// The easiest way to create expressions is to use the
@@ -392,7 +392,7 @@ fn type_coercion_demo() -> Result<()> {
)?;
assert!(physical_expr.evaluate(&batch).is_ok());

-// 4. Apply explict type coercion by manually rewriting the expression
+// 4. Apply explicit type coercion by manually rewriting the expression
let coerced_expr = expr
.transform(|e| {
// Only type coerces binary expressions.
2 changes: 1 addition & 1 deletion datafusion-examples/examples/function_factory.rs
@@ -36,7 +36,7 @@ use datafusion_expr::{
///
/// Apart from [FunctionFactory], this example covers
/// [ScalarUDFImpl::simplify()] which is often used at the same time, to replace
-/// a function call with another expression at rutime.
+/// a function call with another expression at runtime.
///
/// This example is rather simple and does not cover all cases required for a
/// real implementation.
2 changes: 1 addition & 1 deletion datafusion-examples/examples/memtable.rs
@@ -25,7 +25,7 @@ use std::sync::Arc;
use std::time::Duration;
use tokio::time::timeout;

-/// This example demonstrates executing a simple query against a Memtable
+/// This example demonstrates executing a simple query against a [`MemTable`]
#[tokio::main]
async fn main() -> Result<()> {
let mem_table = create_memtable()?;
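To make the `MemTable` reference concrete, here is a minimal sketch of registering an in-memory table and querying it (the table name and data are invented for illustration):

```rust
use std::sync::Arc;

use datafusion::arrow::array::Int32Array;
use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::datasource::MemTable;
use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
    let batch = RecordBatch::try_new(
        schema.clone(),
        vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
    )?;
    // a MemTable holds a Vec of partitions, each a Vec of RecordBatches
    let table = MemTable::try_new(schema, vec![vec![batch]])?;

    let ctx = SessionContext::new();
    ctx.register_table("t", Arc::new(table))?;
    ctx.sql("SELECT id FROM t").await?.show().await?;
    Ok(())
}
```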
2 changes: 1 addition & 1 deletion datafusion-examples/examples/optimizer_rule.rs
@@ -146,7 +146,7 @@ impl MyOptimizerRule {
// Closure called for each sub tree
match expr {
Expr::BinaryExpr(binary_expr) if is_binary_eq(&binary_expr) => {
-// destruture the expression
+// destructure the expression
let BinaryExpr { left, op: _, right } = binary_expr;
// rewrite to `my_eq(left, right)`
let udf = ScalarUDF::new_from_impl(MyEq::new());
2 changes: 1 addition & 1 deletion datafusion-examples/examples/plan_to_sql.rs
@@ -65,7 +65,7 @@ fn simple_expr_to_sql_demo() -> Result<()> {
Ok(())
}

-/// DataFusioon can remove parentheses when converting an expression to SQL.
+/// DataFusion can remove parentheses when converting an expression to SQL.
/// Note that output is intended for humans, not for other SQL engines,
/// as differences in precedence rules can cause expressions to be parsed differently.
fn simple_expr_to_pretty_sql_demo() -> Result<()> {
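A small, hedged sketch of the behavior this doc comment describes, assuming the unparser re-exported at `datafusion::sql::unparser` and its pretty mode (the expression and printed output are illustrative):

```rust
use datafusion::error::Result;
use datafusion::prelude::*;
use datafusion::sql::unparser::Unparser;

fn main() -> Result<()> {
    // the Expr tree spells out grouping explicitly: ((a + 4) > 5)
    let expr = (col("a") + lit(4)).gt(lit(5));

    // pretty mode drops parentheses that operator precedence already implies
    let unparser = Unparser::default().with_pretty(true);
    let sql = unparser.expr_to_sql(&expr)?.to_string();
    println!("{sql}"); // e.g. a + 4 > 5
    Ok(())
}
```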
6 changes: 3 additions & 3 deletions datafusion-examples/examples/simple_udtf.rs
@@ -140,7 +140,7 @@ impl TableFunctionImpl for LocalCsvTableFunc {
let limit = exprs
.get(1)
.map(|expr| {
-// try to simpify the expression, so 1+2 becomes 3, for example
+// try to simplify the expression, so 1+2 becomes 3, for example
let execution_props = ExecutionProps::new();
let info = SimplifyContext::new(&execution_props);
let expr = ExprSimplifier::new(info).simplify(expr.clone())?;
@@ -173,8 +173,8 @@ fn read_csv_batches(csv_path: impl AsRef<Path>) -> Result<(SchemaRef, Vec<Record
.with_header(true)
.build(file)?;
let mut batches = vec![];
-for bacth in reader {
-    batches.push(bacth?);
+for batch in reader {
+    batches.push(batch?);
}
let schema = Arc::new(schema);
Ok((schema, batches))
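The constant folding mentioned in the first hunk ("1+2 becomes 3") is `ExprSimplifier` at work. A minimal hedged sketch, with module paths assumed from DataFusion's optimizer crate (the calls themselves mirror the hunk above):

```rust
use datafusion::error::Result;
use datafusion::execution::context::ExecutionProps;
use datafusion::optimizer::simplify_expressions::{ExprSimplifier, SimplifyContext};
use datafusion::prelude::*;

fn main() -> Result<()> {
    // fold `1 + 2` into the literal `3`, as the limit-argument handling does
    let execution_props = ExecutionProps::new();
    let info = SimplifyContext::new(&execution_props);
    let simplified = ExprSimplifier::new(info).simplify(lit(1i64) + lit(2i64))?;
    assert_eq!(simplified, lit(3i64));
    Ok(())
}
```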