Remove the parquet2 feature and dependency
IMHO this is easier to just do now rather than wasting time on fixing
the build

Fixes #2004
rtyler committed Dec 30, 2023
1 parent 6476de7 · commit 6f92689
Showing 22 changed files with 35 additions and 2,044 deletions.
22 changes: 0 additions & 22 deletions .github/workflows/build.yml
@@ -144,25 +144,3 @@ jobs:
- name: Run tests with native-tls
run: |
cargo test --no-default-features --features integration_test,s3-native-tls,datafusion
parquet2_test:
runs-on: ubuntu-latest
env:
RUSTFLAGS: "-C debuginfo=line-tables-only"
CARGO_INCREMENTAL: 0

steps:
- uses: actions/checkout@v3

- name: Install minimal stable with clippy and rustfmt
uses: actions-rs/toolchain@v1
with:
profile: default
toolchain: stable
override: true

- uses: Swatinem/rust-cache@v2

- name: Run tests
working-directory: crates/deltalake-core
run: cargo test --no-default-features --features=parquet2
27 changes: 0 additions & 27 deletions crates/deltalake-core/Cargo.toml
@@ -81,26 +81,13 @@ num-traits = "0.2.15"
object_store = "0.7"
once_cell = "1.16.0"
parking_lot = "0.12"
parquet2 = { version = "0.17", optional = true }
percent-encoding = "2"
roaring = "0.10.1"
tracing = { version = "0.1", optional = true }
rand = "0.8"
z85 = "3.0.5"
maplit = "1"

# hdfs
datafusion-objectstore-hdfs = { version = "0.1.3", default-features = false, features = [
"hdfs3",
"try_spawn_blocking",
], optional = true }

# S3 lock client
rusoto_core = { version = "0.47", default-features = false, optional = true }
rusoto_credential = { version = "0.47", optional = true }
rusoto_sts = { version = "0.47", default-features = false, optional = true }


# Unity
reqwest = { version = "0.11.18", default-features = false, features = [
"rustls-tls",
@@ -147,7 +134,6 @@ arrow = [
]
default = ["arrow", "parquet"]
datafusion = [
"dep:arrow",
"dep:datafusion",
"datafusion-expr",
"datafusion-common",
@@ -160,23 +146,10 @@
]
datafusion-ext = ["datafusion"]
gcs = ["object_store/gcp"]
hdfs = ["datafusion-objectstore-hdfs"]
# used only for integration testing
integration_test = ["fs_extra", "tempdir"]
json = ["parquet/json"]
python = ["arrow/pyarrow"]
s3-native-tls = [
"rusoto_core/native-tls",
"rusoto_credential",
"rusoto_sts/native-tls",
"object_store/aws",
]
s3 = [
"rusoto_core/rustls",
"rusoto_credential",
"rusoto_sts/rustls",
"object_store/aws",
]
unity-experimental = ["reqwest", "tracing", "hyper"]

[[bench]]
4 changes: 0 additions & 4 deletions crates/deltalake-core/README.md
@@ -48,11 +48,7 @@ cargo run --example read_delta_table
- `datafusion` - enable the `datafusion::datasource::TableProvider` trait implementation for Delta Tables, allowing them to be queried using [DataFusion](https://github.com/apache/arrow-datafusion).
- `datafusion-ext` - DEPRECATED: alias for `datafusion` feature
- `gcs` - enable the Google storage backend to work with Delta Tables in Google Cloud Storage.
- `hdfs` - enable the HDFS storage backend to work with Delta Tables in HDFS.
- `json` - enable the JSON feature of the `parquet` crate for better JSON interoperability.
- `parquet2` - use parquet2 for checkpoint deserialization. Since `arrow` and `parquet` features are enabled by default for backwards compatibility, this feature needs to be used with `--no-default-features`.
- `s3` - enable the S3 storage backend to work with Delta Tables in AWS S3. Uses [rustls](https://github.com/ctz/rustls).
- `s3-native-tls` - enable the S3 storage backend but rely on OpenSSL.
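
Feature flags compose with Cargo's usual switches; as an illustrative invocation (not taken from the README itself, mirroring the CI commands in build.yml above):

    cargo build --features datafusion,gcs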

## Development

7 changes: 2 additions & 5 deletions crates/deltalake-core/src/errors.rs
@@ -23,17 +23,13 @@ pub enum DeltaTableError {
},

/// Error returned when parsing checkpoint parquet.
#[cfg(any(feature = "parquet", feature = "parquet2"))]
#[cfg(feature = "parquet")]
#[error("Failed to parse parquet: {}", .source)]
Parquet {
/// Parquet error details returned when reading the checkpoint failed.
#[cfg(feature = "parquet")]
#[from]
source: parquet::errors::ParquetError,
/// Parquet error details returned when reading the checkpoint failed.
#[cfg(feature = "parquet2")]
#[from]
source: parquet2::error::Error,
},

/// Error returned when converting the schema in Arrow format failed.
@@ -231,6 +227,7 @@ impl From<ProtocolError> for DeltaTableError {
ProtocolError::Arrow { source } => DeltaTableError::Arrow { source },
ProtocolError::IO { source } => DeltaTableError::Io { source },
ProtocolError::ObjectStore { source } => DeltaTableError::ObjectStore { source },
#[cfg(feature = "parquet")]
ProtocolError::ParquetParseError { source } => DeltaTableError::Parquet { source },
_ => DeltaTableError::Protocol { source: value },
}
12 changes: 2 additions & 10 deletions crates/deltalake-core/src/kernel/actions/types.rs
@@ -187,7 +187,7 @@ pub enum ReaderFeatures {
Other(String),
}

#[cfg(all(not(feature = "parquet2"), feature = "parquet"))]
#[cfg(feature = "parquet")]
impl From<&parquet::record::Field> for ReaderFeatures {
fn from(value: &parquet::record::Field) -> Self {
match value {
@@ -330,7 +330,7 @@ impl fmt::Display for WriterFeatures {
}
}

#[cfg(all(not(feature = "parquet2"), feature = "parquet"))]
#[cfg(feature = "parquet")]
impl From<&parquet::record::Field> for WriterFeatures {
fn from(value: &parquet::record::Field) -> Self {
match value {
@@ -599,10 +599,6 @@ pub struct Add {
#[cfg(feature = "parquet")]
#[serde(skip_serializing, skip_deserializing)]
pub partition_values_parsed: Option<parquet::record::Row>,
/// Partition values parsed for parquet2
#[cfg(feature = "parquet2")]
#[serde(skip_serializing, skip_deserializing)]
pub partition_values_parsed: Option<String>,

/// Contains statistics (e.g., count, min/max values for columns) about the data in this file in
/// raw parquet format. This field needs to be written when statistics are available and the
@@ -612,10 +608,6 @@ pub struct Add {
#[cfg(feature = "parquet")]
#[serde(skip_serializing, skip_deserializing)]
pub stats_parsed: Option<parquet::record::Row>,
/// Stats parsed for parquet2
#[cfg(feature = "parquet2")]
#[serde(skip_serializing, skip_deserializing)]
pub stats_parsed: Option<String>,
}

impl Add {
15 changes: 0 additions & 15 deletions crates/deltalake-core/src/lib.rs
@@ -43,8 +43,6 @@
//! - `datafusion` - enable the `datafusion::datasource::TableProvider` trait implementation
//! for Delta Tables, allowing them to be queried using [DataFusion](https://github.com/apache/arrow-datafusion).
//! - `datafusion-ext` - DEPRECATED: alias for `datafusion` feature.
//! - `parquet2` - use parquet2 for checkpoint deserialization. Since `arrow` and `parquet` features
//! are enabled by default for backwards compatibility, this feature needs to be used with `--no-default-features`.
//!
//! # Querying Delta Tables with Datafusion
//!
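
A minimal sketch of what that doc section covers (assuming the `datafusion` feature is enabled; the table path is a placeholder and error handling is elided, so this belongs inside an async function):

    use std::sync::Arc;
    use datafusion::prelude::SessionContext;

    // Open the Delta table (placeholder path) and register it as a
    // DataFusion TableProvider, then query it with SQL.
    let table = deltalake_core::open_table("./path/to/table").await?;
    let ctx = SessionContext::new();
    ctx.register_table("demo", Arc::new(table))?;
    let df = ctx.sql("SELECT count(*) FROM demo").await?;
    df.show().await?;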
@@ -71,16 +69,6 @@
#![allow(rustdoc::invalid_html_tags)]
#![allow(clippy::nonminimal_bool)]

#[cfg(all(feature = "parquet", feature = "parquet2"))]
compile_error!(
"Features parquet and parquet2 are mutually exclusive and cannot be enabled together"
);

#[cfg(all(feature = "s3", feature = "s3-native-tls"))]
compile_error!(
"Features s3 and s3-native-tls are mutually exclusive and cannot be enabled together"
);

#[cfg(all(feature = "glue", feature = "glue-native-tls"))]
compile_error!(
"Features glue and glue-native-tls are mutually exclusive and cannot be enabled together"
@@ -122,8 +110,6 @@ pub use arrow;
pub use datafusion;
#[cfg(feature = "parquet")]
pub use parquet;
#[cfg(feature = "parquet2")]
pub use parquet2;
#[cfg(all(feature = "arrow", feature = "parquet"))]
pub use protocol::checkpoints;

@@ -529,7 +515,6 @@ mod tests {
);
}

// TODO: enable this for parquet2
#[cfg(feature = "parquet")]
#[tokio::test]
async fn read_delta_1_2_1_struct_stats_table() {
18 changes: 3 additions & 15 deletions crates/deltalake-core/src/protocol/mod.rs
@@ -4,8 +4,6 @@

#[cfg(all(feature = "arrow", feature = "parquet"))]
pub mod checkpoints;
#[cfg(feature = "parquet2")]
pub mod parquet2_read;
#[cfg(feature = "parquet")]
mod parquet_read;
mod time_utils;
@@ -60,14 +58,10 @@ pub enum ProtocolError {
#[error("Generic action error: {0}")]
Generic(String),

#[cfg(any(feature = "parquet", feature = "parquet2"))]
#[cfg(feature = "parquet")]
/// Error returned when parsing checkpoint parquet using the parquet crate.
#[error("Failed to parse parquet checkpoint: {source}")]
ParquetParseError {
/// Parquet error details returned when parsing the checkpoint parquet
#[cfg(feature = "parquet2")]
#[from]
source: parquet2::error::Error,
/// Parquet error details returned when parsing the checkpoint parquet
#[cfg(feature = "parquet")]
#[from]
@@ -235,16 +229,10 @@ pub struct StatsParsed {
/// Contains a value smaller than all values present in the file for all columns.
#[cfg(feature = "parquet")]
pub min_values: HashMap<String, parquet::record::Field>,
/// Contains a value smaller than all values present in the file for all columns.
#[cfg(feature = "parquet2")]
pub min_values: HashMap<String, String>,
/// Contains a value larger than all values present in the file for all columns.
#[cfg(feature = "parquet")]
/// Contains a value larger than all values present in the file for all columns.
pub max_values: HashMap<String, parquet::record::Field>,
#[cfg(feature = "parquet2")]
/// Contains a value larger than all values present in the file for all columns.
pub max_values: HashMap<String, String>,
/// The number of null values for all columns.
pub null_count: HashMap<String, i64>,
}
@@ -272,7 +260,7 @@ impl Eq for Add {}

impl Add {
/// Get whatever stats are available. Uses the parquet-parsed `stats_parsed` if present, falling back to the JSON stats.
#[cfg(any(feature = "parquet", feature = "parquet2"))]
#[cfg(feature = "parquet")]
pub fn get_stats(&self) -> Result<Option<Stats>, serde_json::error::Error> {
match self.get_stats_parsed() {
Ok(Some(stats)) => Ok(Some(stats)),
@@ -288,7 +276,7 @@ impl Add {
}

/// Get whatever stats are available.
#[cfg(not(any(feature = "parquet", feature = "parquet2")))]
#[cfg(not(any(feature = "parquet")))]
pub fn get_stats(&self) -> Result<Option<Stats>, serde_json::error::Error> {
self.get_json_stats()
}
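
The fallback gives callers a single entry point whether stats were parsed from parquet or kept as raw JSON; a hypothetical usage sketch (the `add` binding is assumed to be an Add action taken from table state):

    // get_stats() prefers parquet-parsed stats and falls back to
    // deserializing the raw JSON `stats` string.
    if let Some(stats) = add.get_stats()? {
        println!("records in {}: {}", add.path, stats.num_records);
    }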
76 changes: 0 additions & 76 deletions crates/deltalake-core/src/protocol/parquet2_read/boolean.rs

This file was deleted.
