Skip to content

Commit

Permalink
fix: properly deserialize percent-encoded file paths of Remove action…
Browse files Browse the repository at this point in the history
…s, to make sure tombstone and file paths match (delta-io#2035)

Percent-encoded file paths of Remove actions were not properly
deserialized, and when compared to active file paths, the paths didn't
match, which caused tombstones to be recognized as active files (be kept
in the state)

<!---
For example:

- closes delta-io#106
--->

<!---
Share links to useful documentation
--->

---------

Co-authored-by: Igor Borodin <[email protected]>
Co-authored-by: Ion Koutsouris <[email protected]>
Co-authored-by: R. Tyler Croy <[email protected]>
  • Loading branch information
4 people authored and natinimni committed Jan 31, 2024
1 parent 36d7249 commit b40ccf9
Show file tree
Hide file tree
Showing 78 changed files with 106 additions and 0 deletions.
32 changes: 32 additions & 0 deletions rust/src/table/state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -429,8 +429,10 @@ impl DeltaTableState {

#[cfg(test)]
mod tests {

use super::*;
use pretty_assertions::assert_eq;
use serde_json::json;

#[test]
fn state_round_trip() {
Expand Down Expand Up @@ -489,4 +491,34 @@ mod tests {
assert_eq!(2, *state.app_transaction_version().get("abc").unwrap());
assert_eq!(1, *state.app_transaction_version().get("xyz").unwrap());
}

#[test]
fn test_merging_deserialized_special_tombstones_and_files_paths() {
let add = serde_json::from_value(json!({
"path": "x=A%252FA/part-00016-94175338-2acc-40c2-a68a-d08ba677975f.c000.snappy.parquet",
"partitionValues": {"x": "A/A"},
"size": 460,
"modificationTime": 1631873480,
"dataChange": true
}))
.unwrap();

let remove = serde_json::from_value(json!({
"path": "x=A%252FA/part-00016-94175338-2acc-40c2-a68a-d08ba677975f.c000.snappy.parquet",
"deletionTimestamp": 1631873481,
"partitionValues": {"x": "A/A"},
"size": 460,
"modificationTime": 1631873481,
"dataChange": true
}))
.unwrap();

let state = DeltaTableState::from_actions(vec![Action::add(add)], 0).unwrap();
let state_next = DeltaTableState::from_actions(vec![Action::remove(remove)], 1).unwrap();

let mut merged_state = state.clone();
merged_state.merge(state_next, true, true);

assert_eq!(merged_state.files().len(), 0);
}
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{"commitInfo":{"timestamp":1587968586154,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isBlindAppend":true}}
{"protocol":{"minReaderVersion":5,"minWriterVersion":7,"readerFeatures": ["columnMapping","blahabl","deletionVectors","timestampNtz","v2Checkpoint"],"writerFeatures": ["appendOnly","invariants", "checkConstraints", "changeDataFeed","generatedColumns","columnMapping","identityColumns","deletionVectors","rowTracking","timestampNtz","domainMetadata","v2Checkpoint","icebergCompatV1"]}}
{"metaData":{"id":"5fba94ed-9794-4965-ba6e-6ee3c0d22af9","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1587968585495}}
{"add":{"path":"part-00000-a72b1fb3-f2df-41fe-a8f0-e65b746382dd-c000.snappy.parquet","partitionValues":{},"size":262,"modificationTime":1587968586000,"dataChange":true}}
{"add":{"path":"part-00001-c506e79a-0bf8-4e2b-a42b-9731b2e490ae-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968586000,"dataChange":true}}
{"add":{"path":"part-00003-508ae4aa-801c-4c2c-a923-f6f89930a5c1-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968586000,"dataChange":true}}
{"add":{"path":"part-00004-80938522-09c0-420c-861f-5a649e3d9674-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968586000,"dataChange":true}}
{"add":{"path":"part-00006-63ce9deb-bc0f-482d-b9a1-7e717b67f294-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968586000,"dataChange":true}}
{"add":{"path":"part-00007-94f725e2-3963-4b00-9e83-e31021a93cf9-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968586000,"dataChange":true}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{"commitInfo":{"timestamp":1587968596254,"operation":"MERGE","operationParameters":{"predicate":"(oldData.`id` = newData.`id`)"},"readVersion":0,"isBlindAppend":false}}
{"remove":{"path":"part-00006-63ce9deb-bc0f-482d-b9a1-7e717b67f294-c000.snappy.parquet","deletionTimestamp":1587968596250,"dataChange":true}}
{"remove":{"path":"part-00001-c506e79a-0bf8-4e2b-a42b-9731b2e490ae-c000.snappy.parquet","deletionTimestamp":1587968596253,"dataChange":true}}
{"remove":{"path":"part-00007-94f725e2-3963-4b00-9e83-e31021a93cf9-c000.snappy.parquet","deletionTimestamp":1587968596253,"dataChange":true}}
{"remove":{"path":"part-00003-508ae4aa-801c-4c2c-a923-f6f89930a5c1-c000.snappy.parquet","deletionTimestamp":1587968596253,"dataChange":true}}
{"remove":{"path":"part-00004-80938522-09c0-420c-861f-5a649e3d9674-c000.snappy.parquet","deletionTimestamp":1587968596253,"dataChange":true}}
{"add":{"path":"part-00000-a922ea3b-ffc2-4ca1-9074-a278c24c4449-c000.snappy.parquet","partitionValues":{},"size":262,"modificationTime":1587968595000,"dataChange":true}}
{"add":{"path":"part-00004-95c9bc2c-ac85-4581-b3cc-84502b0c314f-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968596000,"dataChange":true}}
{"add":{"path":"part-00005-94a0861b-6455-4bd9-a080-73e02491c643-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968596000,"dataChange":true}}
{"add":{"path":"part-00011-42f838f9-a911-40af-98f5-2fccfa1b123f-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968596000,"dataChange":true}}
{"add":{"path":"part-00045-332fe409-7705-45b1-8d34-a0018cf73b70-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968596000,"dataChange":true}}
{"add":{"path":"part-00049-d3095817-de74-49c1-a888-81565a40161d-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968596000,"dataChange":true}}
{"add":{"path":"part-00058-b462c4cb-0c48-4148-8475-e21d2a2935f8-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968596000,"dataChange":true}}
{"add":{"path":"part-00068-90650739-6a8e-492b-9403-53e33b3778ac-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968596000,"dataChange":true}}
{"add":{"path":"part-00069-c78b4dd8-f955-4643-816f-cbd30a3f8c1b-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968596000,"dataChange":true}}
{"add":{"path":"part-00077-2fcb1c7c-5390-48ee-93f6-0acf11199a0d-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968596000,"dataChange":true}}
{"add":{"path":"part-00107-3f6c2aa0-fc28-4f4c-be15-135e15b398f4-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968596000,"dataChange":true}}
{"add":{"path":"part-00112-07fd790a-11dc-4fde-9acd-623e740be992-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968596000,"dataChange":true}}
{"add":{"path":"part-00116-bc66759e-6381-4f34-8cd4-6688aad8585d-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968596000,"dataChange":true}}
{"add":{"path":"part-00121-d8bc3e53-d2f2-48ce-9909-78da7294ffbd-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968596000,"dataChange":true}}
{"add":{"path":"part-00128-b31c3b81-24da-4a90-a8b4-578c9e9a218d-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968596000,"dataChange":true}}
{"add":{"path":"part-00140-e9b1971d-d708-43fb-b07f-975d2226b800-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968596000,"dataChange":true}}
{"add":{"path":"part-00143-03ceb88e-5283-4193-aa43-993cdf937fd3-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968596000,"dataChange":true}}
{"add":{"path":"part-00150-ec6643fc-4963-4871-9613-f5ad1940b689-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968596000,"dataChange":true}}
{"add":{"path":"part-00154-4630673a-5227-48fb-a03d-e356fcd1564a-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968596000,"dataChange":true}}
{"add":{"path":"part-00164-bf40481c-4afd-4c02-befa-90f056c2d77a-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968596000,"dataChange":true}}
{"add":{"path":"part-00190-8ac0ae67-fb1d-461d-a3d3-8dc112766ff5-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968596000,"dataChange":true}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{"commitInfo":{"timestamp":1587968604143,"operation":"WRITE","operationParameters":{"mode":"Overwrite","partitionBy":"[]"},"readVersion":1,"isBlindAppend":false}}
{"add":{"path":"part-00000-c1777d7d-89d9-4790-b38a-6ee7e24456b1-c000.snappy.parquet","partitionValues":{},"size":262,"modificationTime":1587968602000,"dataChange":true}}
{"add":{"path":"part-00001-7891c33d-cedc-47c3-88a6-abcfb049d3b4-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968602000,"dataChange":true}}
{"add":{"path":"part-00003-53f42606-6cda-4f13-8d07-599a21197296-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968602000,"dataChange":true}}
{"add":{"path":"part-00004-315835fe-fb44-4562-98f6-5e6cfa3ae45d-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968602000,"dataChange":true}}
{"add":{"path":"part-00006-46f2ff20-eb5d-4dda-8498-7bfb2940713b-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968602000,"dataChange":true}}
{"add":{"path":"part-00007-3a0e4727-de0d-41b6-81ef-5223cf40f025-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968602000,"dataChange":true}}
{"remove":{"path":"part-00004-95c9bc2c-ac85-4581-b3cc-84502b0c314f-c000.snappy.parquet","deletionTimestamp":1587968604143,"dataChange":true}}
{"remove":{"path":"part-00107-3f6c2aa0-fc28-4f4c-be15-135e15b398f4-c000.snappy.parquet","deletionTimestamp":1587968604143,"dataChange":true}}
{"remove":{"path":"part-00011-42f838f9-a911-40af-98f5-2fccfa1b123f-c000.snappy.parquet","deletionTimestamp":1587968604143,"dataChange":true}}
{"remove":{"path":"part-00190-8ac0ae67-fb1d-461d-a3d3-8dc112766ff5-c000.snappy.parquet","deletionTimestamp":1587968604143,"dataChange":true}}
{"remove":{"path":"part-00121-d8bc3e53-d2f2-48ce-9909-78da7294ffbd-c000.snappy.parquet","deletionTimestamp":1587968604143,"dataChange":true}}
{"remove":{"path":"part-00005-94a0861b-6455-4bd9-a080-73e02491c643-c000.snappy.parquet","deletionTimestamp":1587968604143,"dataChange":true}}
{"remove":{"path":"part-00077-2fcb1c7c-5390-48ee-93f6-0acf11199a0d-c000.snappy.parquet","deletionTimestamp":1587968604143,"dataChange":true}}
{"remove":{"path":"part-00112-07fd790a-11dc-4fde-9acd-623e740be992-c000.snappy.parquet","deletionTimestamp":1587968604143,"dataChange":true}}
{"remove":{"path":"part-00128-b31c3b81-24da-4a90-a8b4-578c9e9a218d-c000.snappy.parquet","deletionTimestamp":1587968604143,"dataChange":true}}
{"remove":{"path":"part-00000-a922ea3b-ffc2-4ca1-9074-a278c24c4449-c000.snappy.parquet","deletionTimestamp":1587968604143,"dataChange":true}}
{"remove":{"path":"part-00068-90650739-6a8e-492b-9403-53e33b3778ac-c000.snappy.parquet","deletionTimestamp":1587968604143,"dataChange":true}}
{"remove":{"path":"part-00164-bf40481c-4afd-4c02-befa-90f056c2d77a-c000.snappy.parquet","deletionTimestamp":1587968604143,"dataChange":true}}
{"remove":{"path":"part-00045-332fe409-7705-45b1-8d34-a0018cf73b70-c000.snappy.parquet","deletionTimestamp":1587968604143,"dataChange":true}}
{"remove":{"path":"part-00058-b462c4cb-0c48-4148-8475-e21d2a2935f8-c000.snappy.parquet","deletionTimestamp":1587968604143,"dataChange":true}}
{"remove":{"path":"part-00140-e9b1971d-d708-43fb-b07f-975d2226b800-c000.snappy.parquet","deletionTimestamp":1587968604143,"dataChange":true}}
{"remove":{"path":"part-00116-bc66759e-6381-4f34-8cd4-6688aad8585d-c000.snappy.parquet","deletionTimestamp":1587968604143,"dataChange":true}}
{"remove":{"path":"part-00069-c78b4dd8-f955-4643-816f-cbd30a3f8c1b-c000.snappy.parquet","deletionTimestamp":1587968604143,"dataChange":true}}
{"remove":{"path":"part-00154-4630673a-5227-48fb-a03d-e356fcd1564a-c000.snappy.parquet","deletionTimestamp":1587968604143,"dataChange":true}}
{"remove":{"path":"part-00150-ec6643fc-4963-4871-9613-f5ad1940b689-c000.snappy.parquet","deletionTimestamp":1587968604143,"dataChange":true}}
{"remove":{"path":"part-00049-d3095817-de74-49c1-a888-81565a40161d-c000.snappy.parquet","deletionTimestamp":1587968604143,"dataChange":true}}
{"remove":{"path":"part-00000-a72b1fb3-f2df-41fe-a8f0-e65b746382dd-c000.snappy.parquet","deletionTimestamp":1587968604143,"dataChange":true}}
{"remove":{"path":"part-00143-03ceb88e-5283-4193-aa43-993cdf937fd3-c000.snappy.parquet","deletionTimestamp":1587968604143,"dataChange":true}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{"commitInfo":{"timestamp":1587968614187,"operation":"UPDATE","operationParameters":{"predicate":"((id#697L % cast(2 as bigint)) = cast(0 as bigint))"},"readVersion":2,"isBlindAppend":false}}
{"remove":{"path":"part-00003-53f42606-6cda-4f13-8d07-599a21197296-c000.snappy.parquet","deletionTimestamp":1587968614096,"dataChange":true}}
{"remove":{"path":"part-00006-46f2ff20-eb5d-4dda-8498-7bfb2940713b-c000.snappy.parquet","deletionTimestamp":1587968614096,"dataChange":true}}
{"add":{"path":"part-00000-f17fcbf5-e0dc-40ba-adae-ce66d1fcaef6-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968614000,"dataChange":true}}
{"add":{"path":"part-00001-bb70d2ba-c196-4df2-9c85-f34969ad3aa9-c000.snappy.parquet","partitionValues":{},"size":429,"modificationTime":1587968614000,"dataChange":true}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{"commitInfo":{"timestamp":1587968626537,"operation":"DELETE","operationParameters":{"predicate":"[\"((`id` % CAST(2 AS BIGINT)) = CAST(0 AS BIGINT))\"]"},"readVersion":3,"isBlindAppend":false}}
{"remove":{"path":"part-00001-bb70d2ba-c196-4df2-9c85-f34969ad3aa9-c000.snappy.parquet","deletionTimestamp":1587968626536,"dataChange":true}}
{"remove":{"path":"part-00000-f17fcbf5-e0dc-40ba-adae-ce66d1fcaef6-c000.snappy.parquet","deletionTimestamp":1587968626536,"dataChange":true}}
{"add":{"path":"part-00000-2befed33-c358-4768-a43c-3eda0d2a499d-c000.snappy.parquet","partitionValues":{},"size":262,"modificationTime":1587968626000,"dataChange":true}}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 comments on commit b40ccf9

Please sign in to comment.