Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: use physical name for column name lookup in partitions #1836

Merged
merged 37 commits into from
Nov 24, 2023
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
189d604
try fix get_add_actions
aersam Nov 10, 2023
1d13a3a
fix order
aersam Nov 10, 2023
95f5748
Fixes https://github.com/delta-io/delta-rs/issues/1835
aersam Nov 10, 2023
a731f8d
cargo fmt
aersam Nov 10, 2023
9c06990
clippy? happy now?
aersam Nov 10, 2023
6989774
remove inline hint
aersam Nov 10, 2023
2a2a459
Merge branch 'main' of https://github.com/bmsuisse/delta-rs into fix_…
aersam Nov 13, 2023
b397a6c
Update crates/deltalake-core/src/kernel/schema.rs
aersam Nov 13, 2023
dd06417
Update crates/deltalake-core/src/table/state_arrow.rs
aersam Nov 13, 2023
6a0dbdb
finish switch &str
aersam Nov 13, 2023
ad4dbc7
add test files and sketch test_with_column_mapping
aersam Nov 13, 2023
6aa512e
try fix tests
aersam Nov 13, 2023
d30a3b5
better comment
aersam Nov 13, 2023
2ac9245
new attempt for tests
aersam Nov 13, 2023
da5c617
cargo fmt
aersam Nov 13, 2023
f79c7ca
make intention more clear about panic
aersam Nov 14, 2023
9731927
comment why we unwrap
aersam Nov 14, 2023
70ea38e
simplier test file
aersam Nov 14, 2023
034a797
well, late censorship
aersam Nov 14, 2023
03e4336
Merge branch 'main' into fix_state
aersam Nov 14, 2023
d20916d
Improve Error Handling
aersam Nov 15, 2023
e4ad56b
remove ColumnMappingMode, wrong location here
aersam Nov 15, 2023
00e37fd
add check for column_mapping_mode
aersam Nov 15, 2023
c81f920
comment check for column mapping mode
aersam Nov 15, 2023
bcf47f9
Only calculate HashMap if really needed
aersam Nov 15, 2023
bfb9232
Merge branch 'main' into fix_state
aersam Nov 15, 2023
8b91c64
Merge branch 'main' into fix_state
aersam Nov 15, 2023
858410c
Merge branch 'main' into fix_state
aersam Nov 15, 2023
6720f09
Merge branch 'main' into fix_state
aersam Nov 15, 2023
89dd543
Merge branch 'main' into fix_state
aersam Nov 16, 2023
c1b5720
Merge branch 'main' into fix_state
aersam Nov 16, 2023
d19a5a4
Merge branch 'main' into fix_state
aersam Nov 17, 2023
a851ef1
Merge branch 'main' into fix_state
aersam Nov 18, 2023
25734a0
Merge branch 'main' into fix_state
aersam Nov 20, 2023
c7b7152
move ColumnMappingMode to config.rs
aersam Nov 20, 2023
6bf1f10
Merge branch 'main' into fix_state
aersam Nov 21, 2023
4285037
Merge branch 'main' into fix_state
aersam Nov 23, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions crates/deltalake-core/src/kernel/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,16 @@ impl StructField {
self.nullable
}

/// Returns the physical name of this field.
///
/// The physical name is the string stored under the
/// `ColumnMetadataKey::ColumnMappingPhysicalName` metadata key. When that
/// key is not present, the field's own name is returned instead.
///
/// # Panics
///
/// Panics if the value stored under that key is a number rather than a string.
pub fn physical_name(&self) -> &str {
    match self.get_config_value(&ColumnMetadataKey::ColumnMappingPhysicalName) {
        Some(MetadataValue::String(physical)) => physical,
        // A numeric physical name makes no sense here and is against the spec
        Some(MetadataValue::Number(_)) => panic!("Unexpected type for physical name"),
        // No physical name recorded: fall back to the field's own name
        None => &self.name,
    }
}

#[inline]
/// Returns the data type of the column
pub const fn data_type(&self) -> &DataType {
Expand Down
74 changes: 74 additions & 0 deletions crates/deltalake-core/src/protocol/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1082,6 +1082,80 @@ mod tests {
assert_eq!(expected, actions);
}

#[tokio::test]
async fn test_with_column_mapping() {
    // Table that uses column mapping and is partitioned by a mapped column
    let table = crate::open_table("./tests/data/table_with_column_mapping")
        .await
        .unwrap();
    let actions = table.get_state().add_actions_table(true).unwrap();

    // Small constructors for the array kinds that repeat in the expectation
    let strings = |values: Vec<&str>| -> ArrayRef { Arc::new(array::StringArray::from(values)) };
    let int64s = |values: Vec<i64>| -> ArrayRef { Arc::new(array::Int64Array::from(values)) };
    let nulls = || -> ArrayRef { Arc::new(array::NullArray::new(2)) };
    // All three insertion-time tags carry the same pair of values
    let insertion_times = || strings(vec!["1699946088000000", "1699946088000001"]);

    let modification_time: ArrayRef = Arc::new(arrow::array::TimestampMillisecondArray::from(
        vec![1699946088000, 1699946088000],
    ));
    let data_change: ArrayRef = Arc::new(array::BooleanArray::from(vec![true, true]));

    // Column order matters: the batches are compared as a whole.
    // Partition and stats columns are expected under the logical column names.
    let expected_columns: Vec<(&str, ArrayRef)> = vec![
        (
            "path",
            strings(vec![
                "BH/part-00000-4d6e745c-8e04-48d9-aa60-438228358f1a.c000.zstd.parquet",
                "8v/part-00001-69b4a452-aeac-4ffa-bf5c-a0c2833d05eb.c000.zstd.parquet",
            ]),
        ),
        ("size_bytes", int64s(vec![890, 810])),
        ("modification_time", modification_time),
        ("data_change", data_change),
        ("partition.Company Very Short", strings(vec!["BMS", "BME"])),
        ("num_records", int64s(vec![4, 1])),
        ("null_count.Company Very Short", nulls()),
        ("min.Company Very Short", nulls()),
        ("max.Company Very Short", nulls()),
        ("null_count.Super Name", nulls()),
        ("min.Super Name", nulls()),
        ("max.Super Name", nulls()),
        ("tags.INSERTION_TIME", insertion_times()),
        ("tags.MAX_INSERTION_TIME", insertion_times()),
        ("tags.MIN_INSERTION_TIME", insertion_times()),
        (
            "tags.OPTIMIZE_TARGET_SIZE",
            strings(vec!["33554432", "33554432"]),
        ),
    ];
    let expected = RecordBatch::try_from_iter(expected_columns).unwrap();

    assert_eq!(expected, actions);
}

#[tokio::test]
async fn test_with_stats() {
// test table with stats
Expand Down
18 changes: 16 additions & 2 deletions crates/deltalake-core/src/table/state_arrow.rs
Original file line number Diff line number Diff line change
Expand Up @@ -167,13 +167,27 @@ impl DeltaTableState {
})
.collect::<HashMap<&str, _>>();

let physical_name_to_logical_name = metadata
.partition_columns
.iter()
.map(|name| {
let physical_name = metadata
.schema
.field_with_name(name) // Field must exist as it's mentioned in partition_columns. partition_columns are logical names
.unwrap()
aersam marked this conversation as resolved.
Show resolved Hide resolved
.physical_name();
(physical_name, name.as_str())
})
.collect::<HashMap<&str, &str>>();

// Append values
for action in self.files() {
for (name, maybe_value) in action.partition_values.iter() {
let logical_name = physical_name_to_logical_name.get(name.as_str()).unwrap();
if let Some(value) = maybe_value {
builders.get_mut(name.as_str()).unwrap().append_value(value);
builders.get_mut(logical_name).unwrap().append_value(value);
} else {
builders.get_mut(name.as_str()).unwrap().append_null();
builders.get_mut(logical_name).unwrap().append_null();
}
}
}
Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"txnId":"0e8eece8-347f-4c77-bc4f-daf3a5985dc9","tableSizeBytes":1700,"numFiles":2,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"592de637-dd77-4aaa-af00-97d723a7f1f1","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"Company Very Short\",\"type\":\"string\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":1,\"delta.columnMapping.physicalName\":\"col-173b4db9-b5ad-427f-9e75-516aae37fbbb\"}},{\"name\":\"Super Name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":2,\"delta.columnMapping.physicalName\":\"col-3877fd94-0973-4941-ac6b-646849a1ff65\"}}]}","partitionColumns":["Company Very Short"],"configuration":{"delta.columnMapping.mode":"name","delta.autoOptimize.optimizeWrite":"true","delta.columnMapping.maxColumnId":"2","delta.targetFileSize":"33554432","delta.tuneFileSizesForRewrites":"true"},"createdTime":1699946083038},"protocol":{"minReaderVersion":2,"minWriterVersion":5},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fi
leCounts":[2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[1700,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"8v/part-00001-69b4a452-aeac-4ffa-bf5c-a0c2833d05eb.c000.zstd.parquet","partitionValues":{"col-173b4db9-b5ad-427f-9e75-516aae37fbbb":"BME"},"size":810,"modificationTime":1699946088000,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"col-3877fd94-0973-4941-ac6b-646849a1ff65\":\"Timothy Lamb\"},\"maxValues\":{\"col-3877fd94-0973-4941-ac6b-646849a1ff65\":\"Timothy Lamb\"},\"nullCount\":{\"col-3877fd94-0973-4941-ac6b-646849a1ff65\":0}}","tags":{"INSERTION_TIME":"1699946088000001","MIN_INSERTION_TIME":"1699946088000001","MAX_INSERTION_TIME":"1699946088000001","OPTIMIZE_TARGET_SIZE":"33554432"}},{"path":"BH/part-00000-4d6e745c-8e04-48d9-aa60-438228358f1a.c000.zstd.parquet","partitionValues":{"col-173b4db9-b5ad-427f-9e75-516aae37fbbb":"BMS"},"size":890,"modificationTime":1699946088000,"dataChange":false,"stats":"{\"numRecords\":4,\"minValues\":{\"col-3877fd94-0973-4941-ac6b-646849a1ff65\":\"Anthony Johnson\"},\"maxValues\":{\"col-3877fd94-0973-4941-ac6b-646849a1ff65\":\"Stephanie Mcgrath\"},\"nullCount\":{\"col-3877fd94-0973-4941-ac6b-646849a1ff65\":0}}","tags":{"INSERTION_TIME":"1699946088000000","MIN_INSERTION_TIME":"1699946088000000","MAX_INSERTION_TIME":"1699946088000000","OPTIMIZE_TARGET_SIZE":"33554432"}}]}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{"commitInfo":{"timestamp":1699946089972,"userId":"2797914831036774","userName":"[email protected]","operation":"WRITE","operationParameters":{"mode":"Overwrite","statsOnLoad":false,"partitionBy":"[\"Company Very Short\"]"},"notebook":{"notebookId":"3271485675102593"},"clusterId":"0428-070410-lm8e9giw","isolationLevel":"WriteSerializable","isBlindAppend":false,"operationMetrics":{"numFiles":"2","numOutputRows":"5","numOutputBytes":"1700"},"tags":{"restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/13.3.x-photon-scala2.12","txnId":"0e8eece8-347f-4c77-bc4f-daf3a5985dc9"}}
{"metaData":{"id":"592de637-dd77-4aaa-af00-97d723a7f1f1","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"Company Very Short\",\"type\":\"string\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":1,\"delta.columnMapping.physicalName\":\"col-173b4db9-b5ad-427f-9e75-516aae37fbbb\"}},{\"name\":\"Super Name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":2,\"delta.columnMapping.physicalName\":\"col-3877fd94-0973-4941-ac6b-646849a1ff65\"}}]}","partitionColumns":["Company Very Short"],"configuration":{"delta.columnMapping.mode":"name","delta.autoOptimize.optimizeWrite":"true","delta.columnMapping.maxColumnId":"2","delta.targetFileSize":"33554432","delta.tuneFileSizesForRewrites":"true"},"createdTime":1699946083038}}
{"protocol":{"minReaderVersion":2,"minWriterVersion":5}}
{"add":{"path":"BH/part-00000-4d6e745c-8e04-48d9-aa60-438228358f1a.c000.zstd.parquet","partitionValues":{"col-173b4db9-b5ad-427f-9e75-516aae37fbbb":"BMS"},"size":890,"modificationTime":1699946088000,"dataChange":true,"stats":"{\"numRecords\":4,\"minValues\":{\"col-3877fd94-0973-4941-ac6b-646849a1ff65\":\"Anthony Johnson\"},\"maxValues\":{\"col-3877fd94-0973-4941-ac6b-646849a1ff65\":\"Stephanie Mcgrath\"},\"nullCount\":{\"col-3877fd94-0973-4941-ac6b-646849a1ff65\":0}}","tags":{"INSERTION_TIME":"1699946088000000","MIN_INSERTION_TIME":"1699946088000000","MAX_INSERTION_TIME":"1699946088000000","OPTIMIZE_TARGET_SIZE":"33554432"}}}
{"add":{"path":"8v/part-00001-69b4a452-aeac-4ffa-bf5c-a0c2833d05eb.c000.zstd.parquet","partitionValues":{"col-173b4db9-b5ad-427f-9e75-516aae37fbbb":"BME"},"size":810,"modificationTime":1699946088000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"col-3877fd94-0973-4941-ac6b-646849a1ff65\":\"Timothy Lamb\"},\"maxValues\":{\"col-3877fd94-0973-4941-ac6b-646849a1ff65\":\"Timothy Lamb\"},\"nullCount\":{\"col-3877fd94-0973-4941-ac6b-646849a1ff65\":0}}","tags":{"INSERTION_TIME":"1699946088000001","MIN_INSERTION_TIME":"1699946088000001","MAX_INSERTION_TIME":"1699946088000001","OPTIMIZE_TARGET_SIZE":"33554432"}}}
Loading