From ba1b563170c32ed93e66974bfd752179b6ab79f7 Mon Sep 17 00:00:00 2001 From: sigorbor <34869353+sigorbor@users.noreply.github.com> Date: Tue, 16 Jan 2024 19:10:41 +0200 Subject: [PATCH] fix: properly deserialize percent-encoded file paths of Remove actions, to make sure tombstone and file paths match (#2035) # Description Percent-encoded file paths of Remove actions were not properly deserialized, and when compared to active file paths, the paths didn't match, which caused tombstones to be recognized as active files (be kept in the state) # Related Issue(s) # Documentation --------- Co-authored-by: Igor Borodin Co-authored-by: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Co-authored-by: R. Tyler Croy --- .../src/kernel/actions/types.rs | 1 + crates/deltalake-core/src/table/state.rs | 32 +++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/crates/deltalake-core/src/kernel/actions/types.rs b/crates/deltalake-core/src/kernel/actions/types.rs index 3dc177fb5b..f9a2eb9909 100644 --- a/crates/deltalake-core/src/kernel/actions/types.rs +++ b/crates/deltalake-core/src/kernel/actions/types.rs @@ -632,6 +632,7 @@ pub struct Remove { /// [RFC 2396 URI Generic Syntax], which needs to be decoded to get the data file path. /// /// [RFC 2396 URI Generic Syntax]: https://www.ietf.org/rfc/rfc2396.txt + #[serde(with = "serde_path")] pub path: String, /// When `false` the logical file must already be present in the table or the records diff --git a/crates/deltalake-core/src/table/state.rs b/crates/deltalake-core/src/table/state.rs index 03483ef8ff..061abac6af 100644 --- a/crates/deltalake-core/src/table/state.rs +++ b/crates/deltalake-core/src/table/state.rs @@ -363,9 +363,11 @@ impl DeltaTableState { #[cfg(test)] mod tests { + use super::*; use crate::kernel::Txn; use pretty_assertions::assert_eq; + use serde_json::json; #[test] fn state_round_trip() { @@ -414,4 +416,34 @@ mod tests { assert_eq!(2, *state.app_transaction_version().get("abc").unwrap()); assert_eq!(1, *state.app_transaction_version().get("xyz").unwrap()); } + + #[test] + fn test_merging_deserialized_special_tombstones_and_files_paths() { + let add = serde_json::from_value(json!({ + "path": "x=A%252FA/part-00016-94175338-2acc-40c2-a68a-d08ba677975f.c000.snappy.parquet", + "partitionValues": {"x": "A/A"}, + "size": 460, + "modificationTime": 1631873480, + "dataChange": true + })) + .unwrap(); + + let remove = serde_json::from_value(json!({ + "path": "x=A%252FA/part-00016-94175338-2acc-40c2-a68a-d08ba677975f.c000.snappy.parquet", + "deletionTimestamp": 1631873481, + "partitionValues": {"x": "A/A"}, + "size": 460, + "modificationTime": 1631873481, + "dataChange": true + })) + .unwrap(); + + let state = DeltaTableState::from_actions(vec![Action::Add(add)], 0).unwrap(); + let state_next = DeltaTableState::from_actions(vec![Action::Remove(remove)], 1).unwrap(); + + let mut merged_state = state.clone(); + merged_state.merge(state_next, true, true); + + assert_eq!(merged_state.files().len(), 0); + } }