From 54de613c920627d621b6fdbe1d5826c2cdbd0616 Mon Sep 17 00:00:00 2001 From: Jefffrey <22608443+Jefffrey@users.noreply.github.com> Date: Tue, 28 Nov 2023 20:46:58 +1100 Subject: [PATCH 1/5] JSON: write struct array nulls as null --- arrow-json/src/writer.rs | 168 ++++++++++++++++++++++++++++++--------- 1 file changed, 129 insertions(+), 39 deletions(-) diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 4f74817ca1e3..14b2746e13b4 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -129,12 +129,20 @@ where fn struct_array_to_jsonmap_array( array: &StructArray, explicit_nulls: bool, -) -> Result>, ArrowError> { +) -> Result>>, ArrowError> { let inner_col_names = array.column_names(); - let mut inner_objs = iter::repeat(JsonMap::new()) - .take(array.len()) - .collect::>>(); + let mut inner_objs = (0..array.len()) + // Ensure we write nulls for structarrays as nulls in JSON + // Instead of writing a struct with nulls + .map(|index| { + if array.is_null(index) { + None + } else { + Some(JsonMap::new()) + } + }) + .collect::>>>(); for (j, struct_col) in array.columns().iter().enumerate() { set_column_for_json_rows( @@ -227,7 +235,17 @@ fn array_to_json_array_internal( .collect(), DataType::Struct(_) => { let jsonmaps = struct_array_to_jsonmap_array(array.as_struct(), explicit_nulls)?; - Ok(jsonmaps.into_iter().map(Value::Object).collect()) + let json_values = jsonmaps + .into_iter() + .map(|maybe_map| { + if let Some(map) = maybe_map { + Value::Object(map) + } else { + Value::Null + } + }) + .collect(); + Ok(json_values) } DataType::Map(_, _) => as_map_array(array) .iter() @@ -251,6 +269,7 @@ macro_rules! set_column_by_array_type { $rows .iter_mut() .zip(arr.iter()) + .filter_map(|(maybe_row, maybe_value)| maybe_row.as_mut().map(|row| (row, maybe_value))) .for_each(|(row, maybe_value)| { if let Some(j) = maybe_value.map(Into::into) { row.insert($col_name.to_string(), j); @@ -262,7 +281,7 @@ macro_rules! set_column_by_array_type { } fn set_column_by_primitive_type( - rows: &mut [JsonMap], + rows: &mut [Option>], array: &ArrayRef, col_name: &str, explicit_nulls: bool, @@ -274,6 +293,7 @@ fn set_column_by_primitive_type( rows.iter_mut() .zip(primitive_arr.iter()) + .filter_map(|(maybe_row, maybe_value)| maybe_row.as_mut().map(|row| (row, maybe_value))) .for_each(|(row, maybe_value)| { if let Some(j) = maybe_value.and_then(|v| v.into_json_value()) { row.insert(col_name.to_string(), j); @@ -284,7 +304,7 @@ fn set_column_by_primitive_type( } fn set_column_for_json_rows( - rows: &mut [JsonMap], + rows: &mut [Option>], array: &ArrayRef, col_name: &str, explicit_nulls: bool, @@ -325,9 +345,11 @@ fn set_column_for_json_rows( } DataType::Null => { if explicit_nulls { - rows.iter_mut().for_each(|row| { - row.insert(col_name.to_string(), Value::Null); - }); + rows.iter_mut() + .filter_map(|maybe_row| maybe_row.as_mut()) + .for_each(|row| { + row.insert(col_name.to_string(), Value::Null); + }); } } DataType::Boolean => { @@ -348,28 +370,43 @@ fn set_column_for_json_rows( let options = FormatOptions::default(); let formatter = ArrayFormatter::try_new(array.as_ref(), &options)?; let nulls = array.nulls(); - rows.iter_mut().enumerate().for_each(|(idx, row)| { - let maybe_value = nulls - .map(|x| x.is_valid(idx)) - .unwrap_or(true) - .then(|| formatter.value(idx).to_string().into()); - if let Some(j) = maybe_value { - row.insert(col_name.to_string(), j); - } else if explicit_nulls { - row.insert(col_name.to_string(), Value::Null); - }; - }); + rows.iter_mut() + .enumerate() + .filter_map(|(idx, maybe_row)| maybe_row.as_mut().map(|row| (idx, row))) + .for_each(|(idx, row)| { + let maybe_value = nulls + .map(|x| x.is_valid(idx)) + .unwrap_or(true) + .then(|| formatter.value(idx).to_string().into()); + if let Some(j) = maybe_value { + row.insert(col_name.to_string(), j); + } else if explicit_nulls { + row.insert(col_name.to_string(), Value::Null); + } + }); } DataType::Struct(_) => { let inner_objs = struct_array_to_jsonmap_array(array.as_struct(), explicit_nulls)?; - rows.iter_mut().zip(inner_objs).for_each(|(row, obj)| { - row.insert(col_name.to_string(), Value::Object(obj)); - }); + rows.iter_mut() + .zip(inner_objs) + .filter_map(|(maybe_row, maybe_obj)| maybe_row.as_mut().map(|row| (row, maybe_obj))) + .for_each(|(row, maybe_obj)| { + let json = if let Some(obj) = maybe_obj { + Value::Object(obj) + } else { + Value::Null + }; + row.insert(col_name.to_string(), json); + }); } DataType::List(_) => { let listarr = as_list_array(array); - rows.iter_mut().zip(listarr.iter()).try_for_each( - |(row, maybe_value)| -> Result<(), ArrowError> { + rows.iter_mut() + .zip(listarr.iter()) + .filter_map(|(maybe_row, maybe_value)| { + maybe_row.as_mut().map(|row| (row, maybe_value)) + }) + .try_for_each(|(row, maybe_value)| -> Result<(), ArrowError> { let maybe_value = maybe_value .map(|v| array_to_json_array_internal(&v, explicit_nulls).map(Value::Array)) .transpose()?; @@ -379,13 +416,16 @@ fn set_column_for_json_rows( row.insert(col_name.to_string(), Value::Null); } Ok(()) - }, - )?; + })?; } DataType::LargeList(_) => { let listarr = as_large_list_array(array); - rows.iter_mut().zip(listarr.iter()).try_for_each( - |(row, maybe_value)| -> Result<(), ArrowError> { + rows.iter_mut() + .zip(listarr.iter()) + .filter_map(|(maybe_row, maybe_value)| { + maybe_row.as_mut().map(|row| (row, maybe_value)) + }) + .try_for_each(|(row, maybe_value)| -> Result<(), ArrowError> { let maybe_value = maybe_value .map(|v| array_to_json_array_internal(&v, explicit_nulls).map(Value::Array)) .transpose()?; @@ -395,8 +435,7 @@ fn set_column_for_json_rows( row.insert(col_name.to_string(), Value::Null); } Ok(()) - }, - )?; + })?; } DataType::Dictionary(_, value_type) => { let hydrated = arrow_cast::cast::cast(&array, value_type) @@ -422,7 +461,11 @@ fn set_column_for_json_rows( let mut kv = keys.iter().zip(values); - for (i, row) in rows.iter_mut().enumerate() { + for (i, row) in rows + .iter_mut() + .enumerate() + .filter_map(|(i, maybe_row)| maybe_row.as_mut().map(|row| (i, row))) + { if maparr.is_null(i) { row.insert(col_name.to_string(), serde_json::Value::Null); continue; @@ -461,7 +504,7 @@ fn record_batches_to_json_rows_internal( batches: &[&RecordBatch], explicit_nulls: bool, ) -> Result>, ArrowError> { - let mut rows: Vec> = iter::repeat(JsonMap::new()) + let mut rows: Vec>> = iter::repeat(Some(JsonMap::new())) .take(batches.iter().map(|b| b.num_rows()).sum()) .collect(); @@ -479,6 +522,7 @@ fn record_batches_to_json_rows_internal( } } + let rows = rows.into_iter().map(|a| a.unwrap()).collect::>(); Ok(rows) } @@ -1478,10 +1522,6 @@ mod tests { writer.write_batches(&[&batch]).unwrap(); } - // NOTE: The last value should technically be {"list": [null]} but it appears - // that implementations differ on the treatment of a null struct. - // It would be more accurate to return a null struct, so this can be done - // as a follow up. assert_json_eq( &buf, r#"{"list":[{"ints":1}]} @@ -1489,7 +1529,57 @@ mod tests { {"list":[]} {} {"list":[{}]} -{"list":[{}]} +{"list":[null]} +"#, + ); + } + + #[test] + fn json_struct_array_logical_nulls() { + let inner = ListArray::from_iter_primitive::(vec![ + Some(vec![Some(1), Some(2)]), + Some(vec![None]), + Some(vec![]), + Some(vec![Some(3), None]), // masked for a + Some(vec![Some(4), Some(5)]), + None, // masked for b + None, + ]); + + let field = Arc::new(Field::new("list", inner.data_type().clone(), true)); + let array = Arc::new(inner) as ArrayRef; + let struct_array_a = StructArray::from(( + vec![(field.clone(), array.clone())], + Buffer::from([0b01010111]), + )); + let struct_array_b = StructArray::from(vec![(field, array)]); + + let schema = Schema::new(vec![ + Field::new_struct("a", struct_array_a.fields().clone(), true), + Field::new_struct("b", struct_array_b.fields().clone(), true), + ]); + + let batch = RecordBatch::try_new( + Arc::new(schema), + vec![Arc::new(struct_array_a), Arc::new(struct_array_b)], + ) + .unwrap(); + + let mut buf = Vec::new(); + { + let mut writer = LineDelimitedWriter::new(&mut buf); + writer.write_batches(&[&batch]).unwrap(); + } + + assert_json_eq( + &buf, + r#"{"a":{"list":[1,2]},"b":{"list":[1,2]}} +{"a":{"list":[null]},"b":{"list":[null]}} +{"a":{"list":[]},"b":{"list":[]}} +{"a":null,"b":{"list":[3,null]}} +{"a":{"list":[4,5]},"b":{"list":[4,5]}} +{"a":null,"b":{}} +{"a":{},"b":{}} "#, ); } From 720f6e7fd87554b4d88ebd9a1e446cc949988e56 Mon Sep 17 00:00:00 2001 From: Jefffrey <22608443+Jefffrey@users.noreply.github.com> Date: Tue, 28 Nov 2023 20:50:33 +1100 Subject: [PATCH 2/5] Fix --- arrow-json/src/writer.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 14b2746e13b4..5a703b7281dd 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -133,7 +133,7 @@ fn struct_array_to_jsonmap_array( let inner_col_names = array.column_names(); let mut inner_objs = (0..array.len()) - // Ensure we write nulls for structarrays as nulls in JSON + // Ensure we write nulls for struct arrays as nulls in JSON // Instead of writing a struct with nulls .map(|index| { if array.is_null(index) { @@ -1535,7 +1535,7 @@ mod tests { } #[test] - fn json_struct_array_logical_nulls() { + fn json_struct_array_nulls() { let inner = ListArray::from_iter_primitive::(vec![ Some(vec![Some(1), Some(2)]), Some(vec![None]), From 509e9eb261f6aa47603ed94925728ceb0f5fb175 Mon Sep 17 00:00:00 2001 From: Jefffrey <22608443+Jefffrey@users.noreply.github.com> Date: Tue, 28 Nov 2023 20:57:25 +1100 Subject: [PATCH 3/5] Fix --- arrow-json/src/writer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 5a703b7281dd..9ff8572f3769 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -1542,7 +1542,7 @@ mod tests { Some(vec![]), Some(vec![Some(3), None]), // masked for a Some(vec![Some(4), Some(5)]), - None, // masked for b + None, // masked for a None, ]); From fc1e6aea37fbc13a548b70ff8aa6afdd1e7ae742 Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Wed, 29 Nov 2023 07:46:45 +1100 Subject: [PATCH 4/5] Update arrow-json/src/writer.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-json/src/writer.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 9ff8572f3769..6e430ea6e077 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -136,11 +136,7 @@ fn struct_array_to_jsonmap_array( // Ensure we write nulls for struct arrays as nulls in JSON // Instead of writing a struct with nulls .map(|index| { - if array.is_null(index) { - None - } else { - Some(JsonMap::new()) - } + array.is_valid(index).then(JsonMap::new) }) .collect::>>>(); From 1bceed8a7e61d8d22f2317a26ae99d7e34208ba6 Mon Sep 17 00:00:00 2001 From: Jefffrey <22608443+Jefffrey@users.noreply.github.com> Date: Wed, 29 Nov 2023 07:48:40 +1100 Subject: [PATCH 5/5] Refactoring --- arrow-json/src/writer.rs | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 6e430ea6e077..cabda5e2dca8 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -135,9 +135,7 @@ fn struct_array_to_jsonmap_array( let mut inner_objs = (0..array.len()) // Ensure we write nulls for struct arrays as nulls in JSON // Instead of writing a struct with nulls - .map(|index| { - array.is_valid(index).then(JsonMap::new) - }) + .map(|index| array.is_valid(index).then(JsonMap::new)) .collect::>>>(); for (j, struct_col) in array.columns().iter().enumerate() { @@ -233,13 +231,7 @@ fn array_to_json_array_internal( let jsonmaps = struct_array_to_jsonmap_array(array.as_struct(), explicit_nulls)?; let json_values = jsonmaps .into_iter() - .map(|maybe_map| { - if let Some(map) = maybe_map { - Value::Object(map) - } else { - Value::Null - } - }) + .map(|maybe_map| maybe_map.map(Value::Object).unwrap_or(Value::Null)) .collect(); Ok(json_values) }