Skip to content

Commit

Permalink
enhancement: implement generic flattening (#1017)
Browse files Browse the repository at this point in the history
This helps flatten nested JSON
so that all list-type fields are converted to primitive types,
splitting each repeating array into multiple rows.

e.g.

`[
        {
            "id": 1,
            "name": "John Doe",
            "addresses": [
                {
                    "street": "123 Main St",
                    "city": "Springfield",
                    "state": "IL",
                    "zip": "62701"
                },
                {
                    "street": "456 Elm St",
                    "city": "Springfield",
                    "state": "IL",
                    "zip": "62702"
                }
            ]
        }
    ]
`

is converted to the following:

`
[
    {
        "id": 1,
        "name": "John Doe",
        "addresses_street": "123 Main St",
        "addresses_city": "Springfield",
        "addresses_state": "IL",
        "addresses_zip": "62701"
    },
    {
        "id": 1,
        "name": "John Doe",
        "addresses_street": "456 Elm St",
        "addresses_city": "Springfield",
        "addresses_state": "IL",
        "addresses_zip": "62702"
    }
]
`
  • Loading branch information
nikhilsinhaparseable authored Dec 6, 2024
1 parent 199ebfd commit 935ee79
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 26 deletions.
23 changes: 5 additions & 18 deletions src/handlers/http/ingest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -293,9 +293,7 @@ mod tests {
use std::{collections::HashMap, sync::Arc};

use actix_web::test::TestRequest;
use arrow_array::{
types::Int64Type, ArrayRef, Float64Array, Int64Array, ListArray, StringArray,
};
use arrow_array::{ArrayRef, Float64Array, Int64Array, StringArray};
use arrow_schema::{DataType, Field};
use serde_json::json;

Expand Down Expand Up @@ -689,25 +687,14 @@ mod tests {
])
);

let c_a = vec![None, None, Some(vec![Some(1i64)]), Some(vec![Some(1)])];
let c_b = vec![None, None, None, Some(vec![Some(2i64)])];

assert_eq!(
rb.column_by_name("c_a")
.unwrap()
.as_any()
.downcast_ref::<ListArray>()
.unwrap(),
&ListArray::from_iter_primitive::<Int64Type, _, _>(c_a)
rb.column_by_name("c_a").unwrap().as_int64_arr(),
&Int64Array::from(vec![None, None, Some(1), Some(1)])
);

assert_eq!(
rb.column_by_name("c_b")
.unwrap()
.as_any()
.downcast_ref::<ListArray>()
.unwrap(),
&ListArray::from_iter_primitive::<Int64Type, _, _>(c_b)
rb.column_by_name("c_b").unwrap().as_int64_arr(),
&Int64Array::from(vec![None, None, None, Some(2)])
);
}
}
61 changes: 61 additions & 0 deletions src/utils/json/flatten.rs
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,67 @@ pub fn flatten_array_objects(
Ok(())
}

/// Expands a JSON value into a list of values in which array fields of
/// objects have been split into one output row per array element.
///
/// * `Value::Array` — every element is flattened recursively and the results
///   are concatenated in order.
/// * `Value::Object` — starts from a single copy of the object. For each key
///   whose value is an array or an object, the value is flattened recursively
///   and every row produced so far is duplicated once per flattened element,
///   with that element stored under the key. Several array fields on the same
///   object therefore yield the cross product of their elements.
/// * Any other value — returned unchanged as a single-element vector.
///
/// A field whose value flattens to nothing (an empty array, or an array made
/// up only of empty arrays) is left untouched on the row. Expanding it would
/// multiply the rows by zero and silently discard the whole record.
pub fn flatten_json(value: &Value) -> Vec<Value> {
    match value {
        Value::Array(items) => items.iter().flat_map(flatten_json).collect(),
        Value::Object(map) => {
            let mut rows = vec![map.clone()];
            for (key, val) in map {
                // Scalars are already present on every row via the initial clone.
                if !matches!(val, Value::Array(_) | Value::Object(_)) {
                    continue;
                }

                // For an array this is the in-order concatenation of each
                // element's flattening; for an object it is that object's rows.
                let expansions = flatten_json(val);
                if expansions.is_empty() {
                    // Nothing to expand: keep the original value on each row
                    // instead of dropping the record.
                    continue;
                }

                let mut expanded = Vec::with_capacity(rows.len() * expansions.len());
                for expansion in &expansions {
                    for row in &rows {
                        let mut new_row = row.clone();
                        new_row.insert(key.clone(), expansion.clone());
                        expanded.push(new_row);
                    }
                }
                rows = expanded;
            }
            rows.into_iter().map(Value::Object).collect()
        }
        _ => vec![value.clone()],
    }
}

/// Wraps a list of flattened records into a single `Value::Array`.
///
/// Every element of `flattened` must be a JSON object. The elements are moved
/// into the resulting array rather than copied key by key.
///
/// # Errors
///
/// Returns an error if any element is not a JSON object.
pub fn convert_to_array(flattened: Vec<Value>) -> Result<Value, anyhow::Error> {
    flattened
        .into_iter()
        .map(|item| match item {
            // The vector is owned, so the object can be reused as-is.
            Value::Object(_) => Ok(item),
            _ => Err(anyhow!("Expected object in array of objects")),
        })
        .collect::<Result<Vec<_>, _>>()
        .map(Value::Array)
}
#[cfg(test)]
mod tests {
use crate::utils::json::flatten::flatten_array_objects;
Expand Down
19 changes: 11 additions & 8 deletions src/utils/json/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,17 @@ pub fn flatten_json_body(
custom_partition: Option<String>,
validation_required: bool,
) -> Result<Value, anyhow::Error> {
flatten::flatten(
body,
"_",
time_partition,
time_partition_limit,
custom_partition,
validation_required,
)
match flatten::convert_to_array(flatten::flatten_json(&body)) {
Ok(nested_value) => flatten::flatten(
nested_value,
"_",
time_partition,
time_partition_limit,
custom_partition,
validation_required,
),
Err(err) => Err(err),
}
}

pub fn convert_array_to_object(
Expand Down

0 comments on commit 935ee79

Please sign in to comment.