Skip to content

Commit

Permalink
fix: properly deserialize percent-encoded file paths of Remove action…
Browse files Browse the repository at this point in the history
…s, to make sure tombstone and file paths match (#2035)

# Description
Percent-encoded file paths of Remove actions were not properly
deserialized, and when compared to active file paths, the paths didn't
match, which caused tombstones to be recognized as active files (be kept
in the state)

# Related Issue(s)
<!---
For example:

- closes #106
--->

# Documentation

<!---
Share links to useful documentation
--->

---------

Co-authored-by: Igor Borodin <[email protected]>
Co-authored-by: Ion Koutsouris <[email protected]>
Co-authored-by: R. Tyler Croy <[email protected]>
  • Loading branch information
4 people authored Jan 16, 2024
1 parent d65fc18 commit ba1b563
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 0 deletions.
1 change: 1 addition & 0 deletions crates/deltalake-core/src/kernel/actions/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -632,6 +632,7 @@ pub struct Remove {
/// [RFC 2396 URI Generic Syntax], which needs to be decoded to get the data file path.
///
/// [RFC 2396 URI Generic Syntax]: https://www.ietf.org/rfc/rfc2396.txt
#[serde(with = "serde_path")]
pub path: String,

/// When `false` the logical file must already be present in the table or the records
Expand Down
32 changes: 32 additions & 0 deletions crates/deltalake-core/src/table/state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -363,9 +363,11 @@ impl DeltaTableState {

#[cfg(test)]
mod tests {

use super::*;
use crate::kernel::Txn;
use pretty_assertions::assert_eq;
use serde_json::json;

#[test]
fn state_round_trip() {
Expand Down Expand Up @@ -414,4 +416,34 @@ mod tests {
assert_eq!(2, *state.app_transaction_version().get("abc").unwrap());
assert_eq!(1, *state.app_transaction_version().get("xyz").unwrap());
}

#[test]
fn test_merging_deserialized_special_tombstones_and_files_paths() {
let add = serde_json::from_value(json!({
"path": "x=A%252FA/part-00016-94175338-2acc-40c2-a68a-d08ba677975f.c000.snappy.parquet",
"partitionValues": {"x": "A/A"},
"size": 460,
"modificationTime": 1631873480,
"dataChange": true
}))
.unwrap();

let remove = serde_json::from_value(json!({
"path": "x=A%252FA/part-00016-94175338-2acc-40c2-a68a-d08ba677975f.c000.snappy.parquet",
"deletionTimestamp": 1631873481,
"partitionValues": {"x": "A/A"},
"size": 460,
"modificationTime": 1631873481,
"dataChange": true
}))
.unwrap();

let state = DeltaTableState::from_actions(vec![Action::Add(add)], 0).unwrap();
let state_next = DeltaTableState::from_actions(vec![Action::Remove(remove)], 1).unwrap();

let mut merged_state = state.clone();
merged_state.merge(state_next, true, true);

assert_eq!(merged_state.files().len(), 0);
}
}

0 comments on commit ba1b563

Please sign in to comment.