Skip to content

Commit

Permalink
chore: break Glue support into its own crate without rusoto (delta-io…
Browse files Browse the repository at this point in the history
…#1825)

This change also pilots a removal of Rusoto in favor of the AWS SDK for
Rust which AWS is now supporting and funding the development of.

The API surface is largely the same, but I believe this move will also
ensure that we are much more consistent in how we handle AWS environment
variables.

Related to delta-io#1601

---------

Co-authored-by: Robert Pack <[email protected]>
  • Loading branch information
rtyler and roeap authored Dec 11, 2023
1 parent f69d36f commit 6d07bc5
Show file tree
Hide file tree
Showing 11 changed files with 184 additions and 158 deletions.
21 changes: 21 additions & 0 deletions crates/deltalake-catalog-glue/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
[package]
name = "deltalake-catalog-glue"
version = "0.1.0"
edition = "2021"

[dependencies]
async-trait = { workspace = true }
aws-config = "0.57.1"
aws-sdk-glue = "0.35.0"
deltalake-core = { path = "../deltalake-core" }
# This can depend on a lowest common denominator of core once that's released
# deltalake_core = { version = "0.17.0" }
log = "0.4"
thiserror = { workspace = true }

[dev-dependencies]
tokio = { version = "1", features = ["macros", "rt-multi-thread"] }

[features]
default = []
native-tls = []
23 changes: 23 additions & 0 deletions crates/deltalake-catalog-glue/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@

# Lists every target annotated with a trailing double-hash description.
# Uses `grep -E` rather than the obsolescent `egrep` wrapper, which
# GNU grep 3.8 and later warn about on every invocation.
.PHONY: help
help: ## Show this help
	@grep -Eh '\s##\s' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'

.PHONY: all build check test clean
all: check build test ## Perform all the checks builds and testing

check: ## Ensure that the crate meets the basic formatting and structure
cargo fmt --check
cargo clippy
cargo clippy --features native-tls --no-default-features

build: ## Build the crate with each set of features
cargo build
cargo build --features native-tls --no-default-features

test: ## Run the crate's tests with each set of features
cargo test
cargo test --features native-tls --no-default-features

clean: ## Clean up resources from build
cargo clean
20 changes: 20 additions & 0 deletions crates/deltalake-catalog-glue/examples/demo.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
use deltalake_catalog_glue::*;
use deltalake_core::*;

/// Demo entry point: builds a [`GlueDataCatalog`] from the environment,
/// prints it, then looks up and prints the storage location registered for
/// the table `table` in the database `database`.
///
/// Panics (via `expect`) if the catalog cannot be created or the lookup fails.
#[tokio::main]
async fn main() {
    println!("Reading a table");

    let glue = GlueDataCatalog::from_env()
        .await
        .expect("Failed to load catalog from the environment");
    println!("catalog: {glue:?}");

    // Resolve the location first so the lookup and the printing read as two steps.
    let storage_location = glue
        .get_table_storage_location(None, "database", "table")
        .await
        .expect("Failed");
    println!("read: {:?}", storage_location);
}
115 changes: 115 additions & 0 deletions crates/deltalake-catalog-glue/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
//! Glue Data Catalog.
//!
use aws_config::SdkConfig;
use deltalake_core::data_catalog::{DataCatalog, DataCatalogError};
use log::*;

/// Errors raised by the Glue Data Catalog integration.
///
/// A `GlueError` can be converted into a `DataCatalogError` through the
/// `From<GlueError>` implementation in this file, which is what lets catalog
/// methods returning `DataCatalogError` surface these failures.
#[derive(thiserror::Error, Debug)]
pub enum GlueError {
/// Missing metadata in the catalog
///
/// Produced when the Glue `get_table` response lacks the table, its storage
/// descriptor, or the storage location.
#[error("Missing Metadata {metadata} in the Data Catalog ")]
MissingMetadata {
/// The missing metadata property
metadata: String,
},

/// Error calling the AWS SDK
///
/// `#[from]` makes thiserror derive `From<aws_sdk_glue::Error>` for this
/// variant and expose the wrapped value as the error's source.
#[error("Failed in an AWS SDK call")]
AWSError {
/// The underlying error reported by the AWS SDK for Glue
#[from]
source: aws_sdk_glue::Error,
},
}

impl From<GlueError> for DataCatalogError {
fn from(val: GlueError) -> Self {
DataCatalogError::Generic {
catalog: "glue",
source: Box::new(val),
}
}
}

/// A Glue Data Catalog implementation of the [`DataCatalog`] trait.
///
/// Holds the AWS SDK Glue client used to query table metadata.
pub struct GlueDataCatalog {
// Client through which every Glue request of this catalog is issued.
client: aws_sdk_glue::Client,
}

impl GlueDataCatalog {
    /// Builds a [GlueDataCatalog] whose AWS configuration is loaded from the
    /// environment via `aws_config::load_from_env`.
    ///
    /// The body has no failure path of its own and always yields `Ok`.
    pub async fn from_env() -> Result<Self, GlueError> {
        let sdk_config = aws_config::load_from_env().await;
        Ok(Self::with_config(&sdk_config))
    }

    /// Builds a [GlueDataCatalog] backed by a Glue client created from the
    /// supplied [aws_config::SdkConfig].
    pub fn with_config(config: &SdkConfig) -> Self {
        Self {
            client: aws_sdk_glue::Client::new(config),
        }
    }
}

/// Debug output is just the type name; the wrapped client is not printed.
impl std::fmt::Debug for GlueDataCatalog {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("GlueDataCatalog")
    }
}

// Placeholder suffix created by Spark in the Glue Data Catalog Location.
// When a location returned by Glue ends with this suffix, the suffix is cut
// off before the location is handed back to the caller.
const PLACEHOLDER_SUFFIX: &str = "-__PLACEHOLDER__";

/// Builds the error reported when the Glue response lacks `property`.
fn missing_metadata(property: &str) -> GlueError {
    GlueError::MissingMetadata {
        metadata: property.to_string(),
    }
}

/// Normalizes a location string returned by Glue.
///
/// * Removes a trailing [`PLACEHOLDER_SUFFIX`] if present.
/// * Rewrites a leading `s3a://` scheme to `s3://`. Only the scheme is
///   touched: a blanket substring replacement would also corrupt bucket names
///   or keys that happen to contain the text `s3a`.
fn normalize_location(location: &str) -> String {
    let location = location
        .strip_suffix(PLACEHOLDER_SUFFIX)
        .unwrap_or(location);

    match location.strip_prefix("s3a://") {
        Some(rest) => format!("s3://{rest}"),
        None => location.to_string(),
    }
}

#[async_trait::async_trait]
impl DataCatalog for GlueDataCatalog {
    /// Get the table storage location from the Glue Data Catalog
    ///
    /// Issues a Glue `get_table` request for `table_name` in `database_name`,
    /// scoped to `catalog_id` when one is given, and returns the location
    /// stored in the table's storage descriptor after normalization (see
    /// `normalize_location`).
    ///
    /// # Errors
    ///
    /// Returns a [`DataCatalogError`] wrapping a [`GlueError`] when the SDK
    /// call fails, or when the response has no table, no storage descriptor,
    /// or no location.
    async fn get_table_storage_location(
        &self,
        catalog_id: Option<String>,
        database_name: &str,
        table_name: &str,
    ) -> Result<String, DataCatalogError> {
        let mut request = self
            .client
            .get_table()
            .database_name(database_name)
            .name(table_name);

        if let Some(catalog) = catalog_id {
            request = request.catalog_id(catalog);
        }

        // `?` converts `GlueError` into `DataCatalogError` through the
        // `From<GlueError>` implementation, so no explicit second mapping is needed.
        let response = request
            .send()
            .await
            .map_err(|e| GlueError::AWSError { source: e.into() })?;

        let location = response
            .table
            .ok_or_else(|| missing_metadata("Table"))?
            .storage_descriptor
            .ok_or_else(|| missing_metadata("Storage Descriptor"))?
            .location
            .ok_or_else(|| missing_metadata("Location"))?;

        Ok(normalize_location(&location))
    }
}
6 changes: 1 addition & 5 deletions crates/deltalake-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ edition = "2021"
[package.metadata.docs.rs]
# We cannot use all_features because TLS features are mutually exclusive.
# We cannot use hdfs feature because it requires Java to be installed.
features = ["azure", "datafusion", "gcs", "glue", "hdfs", "json", "python", "s3", "unity-experimental"]
features = ["azure", "datafusion", "gcs", "hdfs", "json", "python", "s3", "unity-experimental"]

[dependencies]
# arrow
Expand Down Expand Up @@ -101,8 +101,6 @@ rusoto_credential = { version = "0.47", optional = true }
rusoto_sts = { version = "0.47", default-features = false, optional = true }
deltalake-aws = { path = "../deltalake-aws", default-features = false, optional = true }

# Glue
rusoto_glue = { version = "0.47", default-features = false, optional = true }

# Unity
reqwest = { version = "0.11.18", default-features = false, features = [
Expand Down Expand Up @@ -162,8 +160,6 @@ datafusion = [
]
datafusion-ext = ["datafusion"]
gcs = ["object_store/gcp"]
glue = ["s3", "rusoto_glue/rustls", "tracing", "hyper"]
glue-native-tls = ["s3-native-tls", "rusoto_glue", "tracing", "hyper"]
hdfs = ["datafusion-objectstore-hdfs"]
# used only for integration testing
integration_test = ["fs_extra", "tempdir"]
Expand Down
1 change: 0 additions & 1 deletion crates/deltalake-core/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ cargo run --example read_delta_table
- `datafusion` - enable the `datafusion::datasource::TableProvider` trait implementation for Delta Tables, allowing them to be queried using [DataFusion](https://github.com/apache/arrow-datafusion).
- `datafusion-ext` - DEPRECATED: alias for `datafusion` feature
- `gcs` - enable the Google storage backend to work with Delta Tables in Google Cloud Storage.
- `glue` - enable the Glue data catalog to work with Delta Tables with AWS Glue.
- `hdfs` - enable the HDFS storage backend to work with Delta Tables in HDFS.
- `json` - enable the JSON feature of the `parquet` crate for better JSON interoperability.
- `parquet2` - use parquet2 for checkpoint deserialization. Since `arrow` and `parquet` features are enabled by default for backwards compatibility, this feature needs to be used with `--no-default-features`.
Expand Down
110 changes: 0 additions & 110 deletions crates/deltalake-core/src/data_catalog/glue/mod.rs

This file was deleted.

38 changes: 0 additions & 38 deletions crates/deltalake-core/src/data_catalog/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ pub use unity::*;

#[cfg(feature = "unity-experimental")]
pub mod client;
#[cfg(any(feature = "glue", feature = "glue-native-tls"))]
pub mod glue;
#[cfg(feature = "datafusion")]
pub mod storage;
#[cfg(feature = "unity-experimental")]
Expand All @@ -25,7 +23,6 @@ pub enum DataCatalogError {
Generic {
/// Name of the catalog
catalog: &'static str,

/// Error message
source: Box<dyn std::error::Error + Send + Sync + 'static>,
},
Expand All @@ -48,41 +45,6 @@ pub enum DataCatalogError {
source: reqwest::Error,
},

/// Missing metadata in the catalog
#[cfg(any(feature = "glue", feature = "glue-native-tls"))]
#[error("Missing Metadata {metadata} in the Data Catalog ")]
MissingMetadata {
/// The missing metadata property
metadata: String,
},

/// Glue Glue Data Catalog Error
#[cfg(any(feature = "glue", feature = "glue-native-tls"))]
#[error("Catalog glue error: {source}")]
GlueError {
/// The underlying Glue Data Catalog Error
#[from]
source: rusoto_core::RusotoError<rusoto_glue::GetTableError>,
},

/// Error caused by the http request dispatcher not being able to be created.
#[cfg(any(feature = "glue", feature = "glue-native-tls"))]
#[error("Failed to create request dispatcher: {source}")]
AWSHttpClient {
/// The underlying Rusoto TlsError
#[from]
source: rusoto_core::request::TlsError,
},

/// Error representing a failure to retrieve AWS credentials.
#[cfg(any(feature = "glue", feature = "glue-native-tls"))]
#[error("Failed to retrieve AWS credentials: {source}")]
AWSCredentials {
/// The underlying Rusoto CredentialsError
#[from]
source: rusoto_credential::CredentialsError,
},

/// Error caused by missing environment variable for Unity Catalog.
#[cfg(feature = "unity-experimental")]
#[error("Missing Unity Catalog environment variable: {var_name}")]
Expand Down
1 change: 0 additions & 1 deletion crates/deltalake-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
//! - `s3`, `gcs`, `azure` - enable the storage backends for AWS S3, Google Cloud Storage (GCS),
//! or Azure Blob Storage / Azure Data Lake Storage Gen2 (ADLS2). Use `s3-native-tls` to use native TLS
//! instead of Rust TLS implementation.
//! - `glue` - enable the Glue data catalog to work with Delta Tables with AWS Glue.
//! - `datafusion` - enable the `datafusion::datasource::TableProvider` trait implementation
//! for Delta Tables, allowing them to be queried using [DataFusion](https://github.com/apache/arrow-datafusion).
//! - `datafusion-ext` - DEPRECATED: alias for `datafusion` feature.
Expand Down
Loading

0 comments on commit 6d07bc5

Please sign in to comment.