diff --git a/datafusion-examples/examples/parquet_sql_multiple_files.rs b/datafusion-examples/examples/parquet_sql_multiple_files.rs index 451de96f2e91..0e2968f20356 100644 --- a/datafusion-examples/examples/parquet_sql_multiple_files.rs +++ b/datafusion-examples/examples/parquet_sql_multiple_files.rs @@ -17,31 +17,35 @@ use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::listing::ListingOptions; -use datafusion::error::Result; use datafusion::prelude::*; -use datafusion_common::{FileType, GetExt}; +use object_store::local::LocalFileSystem; +use std::path::Path; use std::sync::Arc; /// This example demonstrates executing a simple query against an Arrow data source (a directory -/// with multiple Parquet files) and fetching results +/// with multiple Parquet files) and fetching results. The query is run twice, once showing +/// how to used `register_listing_table` with an absolute path, and once registering an +/// ObjectStore to use a relative path. #[tokio::main] -async fn main() -> Result<()> { +async fn main() -> Result<(), Box> { // create local execution context let ctx = SessionContext::new(); - let testdata = datafusion::test_util::parquet_test_data(); + let test_data = datafusion::test_util::parquet_test_data(); // Configure listing options let file_format = ParquetFormat::default().with_enable_pruning(Some(true)); let listing_options = ListingOptions::new(Arc::new(file_format)) - .with_file_extension(FileType::PARQUET.get_ext()); + // This is a workaround for this example since `test_data` contains + // many different parquet different files, + // in practice use FileType::PARQUET.get_ext(). + .with_file_extension("alltypes_plain.parquet"); - // Register a listing table - this will use all files in the directory as data sources - // for the query + // First example were we use an absolute path, which requires no additional setup. ctx.register_listing_table( "my_table", - &format!("file://{testdata}/alltypes_plain.parquet"), - listing_options, + &format!("file://{test_data}/"), + listing_options.clone(), None, None, ) @@ -60,5 +64,47 @@ async fn main() -> Result<()> { // print the results df.show().await?; + // Second example were we temporarily move into the test data's parent directory and + // simulate a relative path, this requires registering an ObjectStore. + let cur_dir = std::env::current_dir()?; + + let test_data_path = Path::new(&test_data); + let test_data_path_parent = test_data_path + .parent() + .ok_or("test_data path needs a parent")?; + + std::env::set_current_dir(test_data_path_parent)?; + + let local_fs = Arc::new(LocalFileSystem::default()); + + let u = url::Url::parse("file://./")?; + ctx.runtime_env().register_object_store(&u, local_fs); + + // Register a listing table - this will use all files in the directory as data sources + // for the query + ctx.register_listing_table( + "relative_table", + "./data", + listing_options.clone(), + None, + None, + ) + .await?; + + // execute the query + let df = ctx + .sql( + "SELECT * \ + FROM relative_table \ + LIMIT 1", + ) + .await?; + + // print the results + df.show().await?; + + // Reset the current directory + std::env::set_current_dir(cur_dir)?; + Ok(()) }