Skip to content

Commit

Permalink
Merge branch 'main' into chore--allow-old-casting-behavior-until-depr…
Browse files Browse the repository at this point in the history
…ecation
  • Loading branch information
rtyler authored Sep 12, 2024
2 parents 0bbd20a + 8ba4fe0 commit eb6d42a
Show file tree
Hide file tree
Showing 28 changed files with 221 additions and 98 deletions.
17 changes: 17 additions & 0 deletions .github/codecov.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@

# Codecov configuration: commit-status behaviour and paths excluded from
# coverage reports.
coverage:
  status:
    # Status computed over the whole project.
    project:
      default:
        # allow some leniency on the deviation of pull requests
        threshold: '1%'
        # Report the status without letting it fail the check.
        informational: true
    # Status computed over only the lines touched by the change.
    patch:
      default:
        informational: true

# Paths left out of coverage reports.
ignore:
  - "delta-inspect/"
  - "proofs/"
  - "**/*.toml"
3 changes: 1 addition & 2 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,5 @@ updates:
ignore:
# arrow and datafusion are bumped manually
- dependency-name: "arrow*"
update-types: ["version-update:semver-major"]
- dependency-name: "datafusion*"
update-types: ["version-update:semver-major"]
- dependency-name: "parquet"
25 changes: 0 additions & 25 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,31 +26,6 @@ jobs:
- name: Format
run: cargo fmt -- --check

coverage:
runs-on: ubuntu-latest
env:
CARGO_TERM_COLOR: always
steps:
- uses: actions/checkout@v4
- name: Install rust
uses: actions-rs/toolchain@v1
with:
profile: default
toolchain: '1.80'
override: true
- name: Install cargo-llvm-cov
uses: taiki-e/install-action@cargo-llvm-cov
- uses: Swatinem/rust-cache@v2
- name: Generate code coverage
run: cargo llvm-cov --features ${DEFAULT_FEATURES} --workspace --codecov --output-path codecov.json -- --skip read_table_version_hdfs
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v4
with:
files: codecov.json
fail_ci_if_error: true
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

build:
strategy:
fail-fast: false
Expand Down
36 changes: 36 additions & 0 deletions .github/workflows/codecov.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Generates Rust code coverage with cargo-llvm-cov and uploads the report to
# Codecov.
name: coverage

on:
  push:
    branches: [main, "rust-v*"]
  pull_request:
    branches: [main, "rust-v*"]

env:
  # Comma-separated cargo features enabled for the coverage build.
  # Kept free of stray whitespace so the value stays valid even if the
  # expansion below is ever quoted.
  DEFAULT_FEATURES: "azure,datafusion,s3,gcs,glue,hdfs"

jobs:
  coverage:
    runs-on: ubuntu-latest
    env:
      CARGO_TERM_COLOR: always
    steps:
      - uses: actions/checkout@v4
      # NOTE(review): actions-rs/toolchain appears to be archived upstream;
      # consider migrating to a maintained toolchain installer.
      - name: Install rust
        uses: actions-rs/toolchain@v1
        with:
          profile: default
          # Quoted so the version is not parsed as a float.
          toolchain: '1.80'
          override: true
      - name: Install cargo-llvm-cov
        uses: taiki-e/install-action@cargo-llvm-cov
      - uses: Swatinem/rust-cache@v2
      - name: Generate code coverage
        run: cargo llvm-cov --features ${DEFAULT_FEATURES} --workspace --codecov --output-path codecov.json -- --skip read_table_version_hdfs
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        with:
          files: codecov.json
          fail_ci_if_error: true
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
54 changes: 54 additions & 0 deletions .github/workflows/python_benchmark.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Builds the Python bindings in release mode, runs the pytest benchmarks and
# compares the result against the previously cached benchmark data.
name: python_benchmark

# This is separate from the python_build so that it doesn't need to run on the merge group
on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

defaults:
  run:
    # Applies to `run:` steps only; `uses:` steps resolve paths from the
    # repository root.
    working-directory: ./python

jobs:
  benchmark:
    name: Python Benchmark
    runs-on: ubuntu-latest
    env:
      RUSTFLAGS: "-C debuginfo=line-tables-only"
      # Environment values are strings; quoted to make that explicit.
      CARGO_INCREMENTAL: "0"

    steps:
      # v4 matches the checkout version used by the coverage workflow.
      - uses: actions/checkout@v4

      - name: Setup Environment
        uses: ./.github/actions/setup-env

      - name: Build deltalake in release mode
        run: |
          python -m venv venv
          source venv/bin/activate
          MATURIN_EXTRA_ARGS=--release make develop

      # Download previous benchmark result from cache (if exists)
      # actions/cache v1-v2 have been retired by GitHub, so v4 is required.
      - name: Download previous benchmark data
        uses: actions/cache@v4
        with:
          path: ./cache
          key: ${{ runner.os }}-benchmark

      - name: Run benchmark
        run: |
          source venv/bin/activate
          pytest tests/test_benchmark.py -m benchmark --benchmark-json output.json

      - name: Store benchmark result
        uses: benchmark-action/github-action-benchmark@v1
        with:
          tool: "pytest"
          # Written by the previous step inside ./python (its working directory).
          output-file-path: python/output.json
          external-data-json-path: ./cache/benchmark-data.json
          fail-on-alert: true

39 changes: 0 additions & 39 deletions .github/workflows/python_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -97,45 +97,6 @@ jobs:
python -m pytest -m "not pandas and not integration and not benchmark"
pip install pandas
benchmark:
name: Python Benchmark
runs-on: ubuntu-latest
env:
RUSTFLAGS: "-C debuginfo=line-tables-only"
CARGO_INCREMENTAL: 0

steps:
- uses: actions/checkout@v2

- name: Setup Environment
uses: ./.github/actions/setup-env

- name: Build deltalake in release mode
run: |
python -m venv venv
source venv/bin/activate
MATURIN_EXTRA_ARGS=--release make develop
# Download previous benchmark result from cache (if exists)
- name: Download previous benchmark data
uses: actions/cache@v2
with:
path: ./cache
key: ${{ runner.os }}-benchmark

- name: Run benchmark
run: |
source venv/bin/activate
pytest tests/test_benchmark.py -m benchmark --benchmark-json output.json
- name: Store benchmark result
uses: benchmark-action/github-action-benchmark@v1
with:
tool: "pytest"
output-file-path: python/output.json
external-data-json-path: ./cache/benchmark-data.json
fail-on-alert: true

test-pyspark:
name: PySpark Integration Tests
runs-on: ubuntu-latest
Expand Down
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ debug = true
debug = "line-tables-only"

[workspace.dependencies]
delta_kernel = { version = "0.3.0" }
delta_kernel = { version = "=0.3.0" }
# delta_kernel = { path = "../delta-kernel-rs/kernel", version = "0.3.0" }

# arrow
Expand Down
4 changes: 2 additions & 2 deletions crates/aws/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "deltalake-aws"
version = "0.1.4"
version = "0.2.0"
authors.workspace = true
keywords.workspace = true
readme.workspace = true
Expand All @@ -12,7 +12,7 @@ repository.workspace = true
rust-version.workspace = true

[dependencies]
deltalake-core = { version = ">=0.17.0, <0.20.0", path = "../core" }
deltalake-core = { version = "0.19.1", path = "../core" }
aws-smithy-runtime-api = { version="1.1.7" }
aws-smithy-runtime = { version="1.1.7", optional = true}
aws-credential-types = { version="1.1.7", features = ["hardcoded-credentials"]}
Expand Down
4 changes: 2 additions & 2 deletions crates/azure/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "deltalake-azure"
version = "0.1.4"
version = "0.2.0"
authors.workspace = true
keywords.workspace = true
readme.workspace = true
Expand All @@ -12,7 +12,7 @@ repository.workspace = true
rust-version.workspace = true

[dependencies]
deltalake-core = { version = ">=0.17.0, <0.20.0", path = "../core" }
deltalake-core = { version = "0.19.1", path = "../core" }
lazy_static = "1"

# workspace dependencies
Expand Down
6 changes: 2 additions & 4 deletions crates/catalog-glue/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "deltalake-catalog-glue"
version = "0.2.0"
version = "0.3.0"
authors.workspace = true
keywords.workspace = true
readme.workspace = true
Expand All @@ -15,9 +15,7 @@ rust-version.workspace = true
async-trait = { workspace = true }
aws-config = "1"
aws-sdk-glue = "1"
deltalake-core = { version = ">=0.17.0, <0.20.0", path = "../core" }
# This can depend on a lowest common denominator of core once that's released
# deltalake_core = { version = "0.17.0" }
deltalake-core = { version = "0.19.1", path = "../core" }
thiserror = { workspace = true }

[dev-dependencies]
Expand Down
5 changes: 3 additions & 2 deletions crates/core/src/kernel/snapshot/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ pub(super) fn read_adds(array: &dyn ProvidesColumnByName) -> DeltaResult<Vec<Add
let size = ex::extract_and_cast::<Int64Array>(arr, "size")?;
let modification_time = ex::extract_and_cast::<Int64Array>(arr, "modificationTime")?;
let data_change = ex::extract_and_cast::<BooleanArray>(arr, "dataChange")?;
let stats = ex::extract_and_cast::<StringArray>(arr, "stats")?;
let stats = ex::extract_and_cast_opt::<StringArray>(arr, "stats");
let tags = ex::extract_and_cast_opt::<MapArray>(arr, "tags");
let dv = ex::extract_and_cast_opt::<StructArray>(arr, "deletionVector");

Expand Down Expand Up @@ -126,7 +126,8 @@ pub(super) fn read_adds(array: &dyn ProvidesColumnByName) -> DeltaResult<Vec<Add
size: ex::read_primitive(size, i)?,
modification_time: ex::read_primitive(modification_time, i)?,
data_change: ex::read_bool(data_change, i)?,
stats: ex::read_str_opt(stats, i).map(|s| s.to_string()),
stats: stats
.and_then(|stats| ex::read_str_opt(stats, i).map(|s| s.to_string())),
partition_values: pvs
.and_then(|pv| collect_map(&pv.value(i)).map(|m| m.collect()))
.unwrap_or_default(),
Expand Down
2 changes: 1 addition & 1 deletion crates/core/src/operations/write.rs
Original file line number Diff line number Diff line change
Expand Up @@ -578,7 +578,7 @@ async fn execute_non_empty_expr(
let input_dfschema: DFSchema = df_schema.as_ref().clone().try_into()?;

let scan_config = DeltaScanConfigBuilder::new()
.with_schema(df_schema)
.with_schema(snapshot.input_schema()?)
.build(snapshot)?;

let scan = DeltaScanBuilder::new(snapshot, log_store.clone(), &state)
Expand Down
9 changes: 9 additions & 0 deletions crates/core/src/protocol/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1221,6 +1221,15 @@ mod tests {
assert_eq!(&expected_null_count, null_count_column);
}

/// Opens the `delta-checkpoint-stats-optional` fixture table, reloads it, and
/// checks that its snapshot exposes exactly two file actions.
#[tokio::test]
async fn test_table_checkpoint_not_always_with_stats() {
    let fixture_uri = "../test/tests/data/delta-checkpoint-stats-optional";
    let mut delta_table = crate::open_table(fixture_uri).await.unwrap();
    delta_table.load().await.unwrap();

    let action_count = delta_table
        .snapshot()
        .unwrap()
        .file_actions()
        .unwrap()
        .len();
    assert_eq!(2, action_count);
}

#[tokio::test]
async fn test_only_struct_stats() {
// test table with no json stats
Expand Down
22 changes: 22 additions & 0 deletions crates/core/src/writer/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use std::{collections::HashMap, ops::AddAssign};

use delta_kernel::expressions::Scalar;
use indexmap::IndexMap;
use itertools::Itertools;
use parquet::file::metadata::ParquetMetaData;
use parquet::format::FileMetaData;
use parquet::schema::types::{ColumnDescriptor, SchemaDescriptor};
Expand Down Expand Up @@ -130,8 +131,29 @@ fn stats_from_metadata(
let mut min_values: HashMap<String, ColumnValueStat> = HashMap::new();
let mut max_values: HashMap<String, ColumnValueStat> = HashMap::new();
let mut null_count: HashMap<String, ColumnCountStat> = HashMap::new();
let dialect = sqlparser::dialect::GenericDialect {};

let idx_to_iterate = if let Some(stats_cols) = stats_columns {
let stats_cols = stats_cols
.into_iter()
.map(|v| {
match sqlparser::parser::Parser::new(&dialect)
.try_with_sql(v)
.map_err(|e| DeltaTableError::generic(e.to_string()))?
.parse_multipart_identifier()
{
Ok(parts) => Ok(parts.into_iter().map(|v| v.value).join(".")),
Err(e) => {
return Err(DeltaWriterError::DeltaTable(
DeltaTableError::GenericError {
source: Box::new(e),
},
))
}
}
})
.collect::<Result<Vec<String>, DeltaWriterError>>()?;

schema_descriptor
.columns()
.iter()
Expand Down
12 changes: 6 additions & 6 deletions crates/deltalake/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@ rust-version.workspace = true
features = ["azure", "datafusion", "gcs", "hdfs", "json", "python", "s3", "unity-experimental"]

[dependencies]
deltalake-core = { version = "0.19.0", path = "../core" }
deltalake-aws = { version = "0.1.4", path = "../aws", default-features = false, optional = true }
deltalake-azure = { version = "0.1.4", path = "../azure", optional = true }
deltalake-gcp = { version = "0.2.2", path = "../gcp", optional = true }
deltalake-hdfs = { version = "0.2.0", path = "../hdfs", optional = true }
deltalake-catalog-glue = { version = "0.2.0", path = "../catalog-glue", optional = true }
deltalake-core = { version = "0.19.1", path = "../core" }
deltalake-aws = { version = "0.2.0", path = "../aws", default-features = false, optional = true }
deltalake-azure = { version = "0.2.0", path = "../azure", optional = true }
deltalake-gcp = { version = "0.3.0", path = "../gcp", optional = true }
deltalake-hdfs = { version = "0.3.0", path = "../hdfs", optional = true }
deltalake-catalog-glue = { version = "0.3.0", path = "../catalog-glue", optional = true }

[features]
# All of these features are just reflected into the core crate until that
Expand Down
4 changes: 2 additions & 2 deletions crates/gcp/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "deltalake-gcp"
version = "0.2.3"
version = "0.3.0"
authors.workspace = true
keywords.workspace = true
readme.workspace = true
Expand All @@ -12,7 +12,7 @@ repository.workspace = true
rust-version.workspace = true

[dependencies]
deltalake-core = { version = ">=0.17.0, <0.20.0", path = "../core" }
deltalake-core = { version = "0.19.1", path = "../core" }
lazy_static = "1"

# workspace dependencies
Expand Down
4 changes: 2 additions & 2 deletions crates/hdfs/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "deltalake-hdfs"
version = "0.2.0"
version = "0.3.0"
authors.workspace = true
keywords.workspace = true
readme.workspace = true
Expand All @@ -12,7 +12,7 @@ repository.workspace = true
rust-version.workspace = true

[dependencies]
deltalake-core = { version = ">=0.17.0, <0.20.0", path = "../core" }
deltalake-core = { version = "0.19.1", path = "../core" }
hdfs-native-object-store = "0.11"

# workspace dependencies
Expand Down
Loading

0 comments on commit eb6d42a

Please sign in to comment.