Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: break Glue support into its own crate without rusoto #1825

Merged
merged 1 commit into from
Dec 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions crates/deltalake-catalog-glue/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
[package]
name = "deltalake-catalog-glue"
version = "0.1.0"
edition = "2021"

[dependencies]
async-trait = { workspace = true }
aws-config = "0.57.1"
aws-sdk-glue = "0.35.0"
deltalake-core = { path = "../deltalake-core" }
# This can depend on a lowest common denominator of core once that's released
# deltalake_core = { version = "0.17.0" }
log = "0.4"
thiserror = { workspace = true }

[dev-dependencies]
tokio = { version = "1", features = ["macros", "rt-multi-thread"] }

[features]
default = []
native-tls = []
23 changes: 23 additions & 0 deletions crates/deltalake-catalog-glue/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@

# Print every documented target (those carrying a trailing double-hash
# description) as an aligned, colourised, alphabetically sorted list.
# `grep -E` is used instead of `egrep`, which is obsolescent and emits a
# warning on current GNU grep releases.
.PHONY: help
help: ## Show this help
	@grep -Eh '\s##\s' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'

# None of these targets produce a file named after themselves, so they are
# all declared phony.
.PHONY: all build check test clean
# Umbrella target: runs the check, build and test stages.
all: check build test ## Perform all the checks builds and testing

# Formatting check plus clippy; clippy runs once with the default features and
# once with only the `native-tls` feature enabled.
check: ## Ensure that the crate meets the basic formatting and structure
cargo fmt --check
cargo clippy
cargo clippy --features native-tls --no-default-features

# Compiles the crate once per feature set (default, then `native-tls` only).
build: ## Build the crate with each set of features
cargo build
cargo build --features native-tls --no-default-features

# Runs the test suite once per feature set (default, then `native-tls` only).
test: ## Run the crate's tests with each set of features
cargo test
cargo test --features native-tls --no-default-features

# Removes cargo's build output.
clean: ## Clean up resources from build
cargo clean
20 changes: 20 additions & 0 deletions crates/deltalake-catalog-glue/examples/demo.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
use deltalake_catalog_glue::*;
use deltalake_core::*;

/// Demo entry point: builds a Glue catalog from the environment and prints
/// the storage location it reports for the table `table` in `database`.
///
/// Panics (via `expect`) if the catalog cannot be created or the lookup fails.
#[tokio::main]
async fn main() {
    println!("Reading a table");

    let catalog = GlueDataCatalog::from_env()
        .await
        .expect("Failed to load catalog from the environment");
    println!("catalog: {catalog:?}");

    // Resolve the location first so the lookup and the printing are separate steps.
    let location = catalog
        .get_table_storage_location(None, "database", "table")
        .await
        .expect("Failed");
    println!("read: {:?}", location);
}
115 changes: 115 additions & 0 deletions crates/deltalake-catalog-glue/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
//! Glue Data Catalog.
//!
use aws_config::SdkConfig;
use deltalake_core::data_catalog::{DataCatalog, DataCatalogError};
use log::*;

/// Errors produced by the Glue Data Catalog integration.
#[derive(thiserror::Error, Debug)]
pub enum GlueError {
/// Missing metadata in the catalog
#[error("Missing Metadata {metadata} in the Data Catalog ")]
MissingMetadata {
/// The missing metadata property
metadata: String,
},

/// Error calling the AWS SDK
#[error("Failed in an AWS SDK call")]
AWSError {
/// The underlying error reported by the AWS Glue SDK
#[from]
source: aws_sdk_glue::Error,
},
}

impl From<GlueError> for DataCatalogError {
fn from(val: GlueError) -> Self {
DataCatalogError::Generic {
catalog: "glue",
source: Box::new(val),
}
}
}

/// A Glue Data Catalog implementation of the [`DataCatalog`] trait
pub struct GlueDataCatalog {
/// AWS SDK client used to issue Glue API requests
client: aws_sdk_glue::Client,
}

impl GlueDataCatalog {
    /// Builds a [GlueDataCatalog] from the AWS configuration found in the environment.
    ///
    /// The configuration is loaded with `aws_config::load_from_env`. The current
    /// implementation has no failure path and always returns `Ok`.
    pub async fn from_env() -> Result<Self, GlueError> {
        let sdk_config = aws_config::load_from_env().await;
        Ok(Self::with_config(&sdk_config))
    }

    /// Builds a [GlueDataCatalog] whose Glue client is created from the given
    /// [aws_config::SdkConfig].
    pub fn with_config(config: &SdkConfig) -> Self {
        Self {
            client: aws_sdk_glue::Client::new(config),
        }
    }
}

impl std::fmt::Debug for GlueDataCatalog {
    /// Writes the fixed name `GlueDataCatalog`; the wrapped SDK client is not
    /// included in the output.
    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        fmt.write_str("GlueDataCatalog")
    }
}

/// Placeholder suffix created by Spark at the end of the table location it
/// records in the Glue Data Catalog. It is stripped from the location before
/// the location is returned to callers.
const PLACEHOLDER_SUFFIX: &str = "-__PLACEHOLDER__";
rtyler marked this conversation as resolved.
Show resolved Hide resolved

#[async_trait::async_trait]
impl DataCatalog for GlueDataCatalog {
    /// Get the table storage location from the Glue Data Catalog
    ///
    /// Issues a `GetTable` request for `table_name` in `database_name`. When
    /// `catalog_id` is `Some`, it is forwarded on the request; otherwise the
    /// request is sent without a catalog id.
    ///
    /// The returned location is normalised in two ways:
    /// * a leading `s3a://` scheme is rewritten to `s3://` (only the scheme is
    ///   touched; bucket names and keys containing `s3a` are left intact);
    /// * a trailing [`PLACEHOLDER_SUFFIX`] is removed.
    ///
    /// # Errors
    ///
    /// Returns a [`DataCatalogError`] wrapping:
    /// * [`GlueError::AWSError`] when the SDK call fails;
    /// * [`GlueError::MissingMetadata`] when the response has no table, no
    ///   storage descriptor, or no location.
    async fn get_table_storage_location(
        &self,
        catalog_id: Option<String>,
        database_name: &str,
        table_name: &str,
    ) -> Result<String, DataCatalogError> {
        let mut request = self
            .client
            .get_table()
            .database_name(database_name)
            .name(table_name);

        if let Some(catalog) = catalog_id {
            request = request.catalog_id(catalog);
        }

        // `?` converts `GlueError` into `DataCatalogError` through the `From` impl.
        let response = request
            .send()
            .await
            .map_err(|e| GlueError::AWSError { source: e.into() })?;

        // Built lazily so the error string is only allocated on the failure path.
        let missing = |metadata: &str| GlueError::MissingMetadata {
            metadata: metadata.to_string(),
        };

        let location = response
            .table
            .ok_or_else(|| missing("Table"))?
            .storage_descriptor
            .ok_or_else(|| missing("Storage Descriptor"))?
            .location
            .ok_or_else(|| missing("Location"))?;

        // Rewrite only the URI scheme: a blanket `replace("s3a", "s3")` would
        // also corrupt bucket names or object keys that contain "s3a".
        let location = match location.strip_prefix("s3a://") {
            Some(rest) => format!("s3://{rest}"),
            None => location,
        };

        // Drop the placeholder suffix, if present.
        Ok(match location.strip_suffix(PLACEHOLDER_SUFFIX) {
            Some(stripped) => stripped.to_string(),
            None => location,
        })
    }
}
6 changes: 1 addition & 5 deletions crates/deltalake-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ edition = "2021"
[package.metadata.docs.rs]
# We cannot use all_features because TLS features are mutually exclusive.
# We cannot use hdfs feature because it requires Java to be installed.
features = ["azure", "datafusion", "gcs", "glue", "hdfs", "json", "python", "s3", "unity-experimental"]
features = ["azure", "datafusion", "gcs", "hdfs", "json", "python", "s3", "unity-experimental"]

[dependencies]
# arrow
Expand Down Expand Up @@ -101,8 +101,6 @@ rusoto_credential = { version = "0.47", optional = true }
rusoto_sts = { version = "0.47", default-features = false, optional = true }
deltalake-aws = { path = "../deltalake-aws", default-features = false, optional = true }

# Glue
rusoto_glue = { version = "0.47", default-features = false, optional = true }

# Unity
reqwest = { version = "0.11.18", default-features = false, features = [
Expand Down Expand Up @@ -162,8 +160,6 @@ datafusion = [
]
datafusion-ext = ["datafusion"]
gcs = ["object_store/gcp"]
glue = ["s3", "rusoto_glue/rustls", "tracing", "hyper"]
glue-native-tls = ["s3-native-tls", "rusoto_glue", "tracing", "hyper"]
hdfs = ["datafusion-objectstore-hdfs"]
# used only for integration testing
integration_test = ["fs_extra", "tempdir"]
Expand Down
1 change: 0 additions & 1 deletion crates/deltalake-core/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ cargo run --example read_delta_table
- `datafusion` - enable the `datafusion::datasource::TableProvider` trait implementation for Delta Tables, allowing them to be queried using [DataFusion](https://github.com/apache/arrow-datafusion).
- `datafusion-ext` - DEPRECATED: alias for `datafusion` feature
- `gcs` - enable the Google storage backend to work with Delta Tables in Google Cloud Storage.
- `glue` - enable the Glue data catalog to work with Delta Tables with AWS Glue.
- `hdfs` - enable the HDFS storage backend to work with Delta Tables in HDFS.
- `json` - enable the JSON feature of the `parquet` crate for better JSON interoperability.
- `parquet2` - use parquet2 for checkpoint deserialization. Since `arrow` and `parquet` features are enabled by default for backwards compatibility, this feature needs to be used with `--no-default-features`.
Expand Down
110 changes: 0 additions & 110 deletions crates/deltalake-core/src/data_catalog/glue/mod.rs

This file was deleted.

38 changes: 0 additions & 38 deletions crates/deltalake-core/src/data_catalog/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ pub use unity::*;

#[cfg(feature = "unity-experimental")]
pub mod client;
#[cfg(any(feature = "glue", feature = "glue-native-tls"))]
pub mod glue;
#[cfg(feature = "datafusion")]
pub mod storage;
#[cfg(feature = "unity-experimental")]
Expand All @@ -25,7 +23,6 @@ pub enum DataCatalogError {
Generic {
/// Name of the catalog
catalog: &'static str,

/// Error message
source: Box<dyn std::error::Error + Send + Sync + 'static>,
},
Expand All @@ -48,41 +45,6 @@ pub enum DataCatalogError {
source: reqwest::Error,
},

/// Missing metadata in the catalog
#[cfg(any(feature = "glue", feature = "glue-native-tls"))]
#[error("Missing Metadata {metadata} in the Data Catalog ")]
MissingMetadata {
/// The missing metadata property
metadata: String,
},

/// Glue Glue Data Catalog Error
#[cfg(any(feature = "glue", feature = "glue-native-tls"))]
#[error("Catalog glue error: {source}")]
GlueError {
/// The underlying Glue Data Catalog Error
#[from]
source: rusoto_core::RusotoError<rusoto_glue::GetTableError>,
},

/// Error caused by the http request dispatcher not being able to be created.
#[cfg(any(feature = "glue", feature = "glue-native-tls"))]
#[error("Failed to create request dispatcher: {source}")]
AWSHttpClient {
/// The underlying Rusoto TlsError
#[from]
source: rusoto_core::request::TlsError,
},

/// Error representing a failure to retrieve AWS credentials.
#[cfg(any(feature = "glue", feature = "glue-native-tls"))]
#[error("Failed to retrieve AWS credentials: {source}")]
AWSCredentials {
/// The underlying Rusoto CredentialsError
#[from]
source: rusoto_credential::CredentialsError,
},

/// Error caused by missing environment variable for Unity Catalog.
#[cfg(feature = "unity-experimental")]
#[error("Missing Unity Catalog environment variable: {var_name}")]
Expand Down
1 change: 0 additions & 1 deletion crates/deltalake-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
//! - `s3`, `gcs`, `azure` - enable the storage backends for AWS S3, Google Cloud Storage (GCS),
//! or Azure Blob Storage / Azure Data Lake Storage Gen2 (ADLS2). Use `s3-native-tls` to use native TLS
//! instead of Rust TLS implementation.
//! - `glue` - enable the Glue data catalog to work with Delta Tables with AWS Glue.
//! - `datafusion` - enable the `datafusion::datasource::TableProvider` trait implementation
//! for Delta Tables, allowing them to be queried using [DataFusion](https://github.com/apache/arrow-datafusion).
//! - `datafusion-ext` - DEPRECATED: alias for `datafusion` feature.
Expand Down
Loading
Loading