Skip to content

Commit

Permalink
refactor: convert Field::metadata to HashMap (#3148)
Browse files Browse the repository at this point in the history
* refactor: convert `Field::metadata` to `HashMap`

Closes #2262.

* refactor: code formatting

Co-authored-by: Raphael Taylor-Davies <[email protected]>

Co-authored-by: Raphael Taylor-Davies <[email protected]>
  • Loading branch information
crepererum and tustvold authored Nov 21, 2022
1 parent 6f8187f commit 57f91f2
Show file tree
Hide file tree
Showing 6 changed files with 90 additions and 42 deletions.
8 changes: 4 additions & 4 deletions arrow-integration-test/src/field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
use crate::{data_type_from_json, data_type_to_json};
use arrow::datatypes::{DataType, Field};
use arrow::error::{ArrowError, Result};
use std::collections::BTreeMap;
use std::collections::HashMap;

/// Parse a `Field` definition from a JSON representation.
pub fn field_from_json(json: &serde_json::Value) -> Result<Field> {
Expand Down Expand Up @@ -53,7 +53,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result<Field> {
// Referenced example file: testing/data/arrow-ipc-stream/integration/1.0.0-littleendian/generated_custom_metadata.json.gz
let metadata = match map.get("metadata") {
Some(&Value::Array(ref values)) => {
let mut res: BTreeMap<String, String> = BTreeMap::default();
let mut res: HashMap<String, String> = HashMap::default();
for value in values {
match value.as_object() {
Some(map) => {
Expand Down Expand Up @@ -92,7 +92,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result<Field> {
// We also support map format, because Schema's metadata supports this.
// See https://github.com/apache/arrow/pull/5907
Some(&Value::Object(ref values)) => {
let mut res: BTreeMap<String, String> = BTreeMap::default();
let mut res: HashMap<String, String> = HashMap::default();
for (k, v) in values {
if let Some(str_value) = v.as_str() {
res.insert(k.clone(), str_value.to_string().clone());
Expand All @@ -110,7 +110,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result<Field> {
"Field `metadata` is not json array".to_string(),
));
}
_ => BTreeMap::default(),
_ => HashMap::default(),
};

// if data_type is a struct or list, get its children
Expand Down
6 changes: 3 additions & 3 deletions arrow-ipc/src/convert.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ use arrow_schema::*;
use flatbuffers::{
FlatBufferBuilder, ForwardsUOffset, UnionWIPOffset, Vector, WIPOffset,
};
use std::collections::{BTreeMap, HashMap};
use std::collections::HashMap;

use crate::{size_prefixed_root_as_message, CONTINUATION_MARKER};
use DataType::*;
Expand Down Expand Up @@ -86,7 +86,7 @@ impl<'a> From<crate::Field<'a>> for Field {
)
};

let mut metadata_map = BTreeMap::default();
let mut metadata_map = HashMap::default();
if let Some(list) = field.custom_metadata() {
for kv in list {
if let (Some(k), Some(v)) = (kv.key(), kv.value()) {
Expand Down Expand Up @@ -812,7 +812,7 @@ mod tests {
.iter()
.cloned()
.collect();
let field_md: BTreeMap<String, String> = [("k".to_string(), "v".to_string())]
let field_md: HashMap<String, String> = [("k".to_string(), "v".to_string())]
.iter()
.cloned()
.collect();
Expand Down
6 changes: 3 additions & 3 deletions arrow-schema/src/datatype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -381,18 +381,18 @@ mod tests {
#[test]
#[cfg(feature = "serde")]
fn serde_struct_type() {
use std::collections::BTreeMap;
use std::collections::HashMap;

let kv_array = [("k".to_string(), "v".to_string())];
let field_metadata: BTreeMap<String, String> = kv_array.iter().cloned().collect();
let field_metadata: HashMap<String, String> = kv_array.iter().cloned().collect();

// Non-empty map: should be converted as JSON obj { ... }
let first_name =
Field::new("first_name", DataType::Utf8, false).with_metadata(field_metadata);

// Empty map: should be omitted.
let last_name = Field::new("last_name", DataType::Utf8, false)
.with_metadata(BTreeMap::default());
.with_metadata(HashMap::default());

let person = DataType::Struct(vec![
first_name,
Expand Down
85 changes: 68 additions & 17 deletions arrow-schema/src/field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

use crate::error::ArrowError;
use std::cmp::Ordering;
use std::collections::BTreeMap;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};

use crate::datatype::DataType;
Expand All @@ -37,9 +37,9 @@ pub struct Field {
/// A map of key-value pairs containing additional custom meta data.
#[cfg_attr(
feature = "serde",
serde(skip_serializing_if = "BTreeMap::is_empty", default)
serde(skip_serializing_if = "HashMap::is_empty", default)
)]
metadata: BTreeMap<String, String>,
metadata: HashMap<String, String>,
}

// Auto-derive `PartialEq` traits will pull `dict_id` and `dict_is_ordered`
Expand Down Expand Up @@ -68,9 +68,33 @@ impl Ord for Field {
fn cmp(&self, other: &Self) -> Ordering {
self.name
.cmp(other.name())
.then(self.data_type.cmp(other.data_type()))
.then(self.nullable.cmp(&other.nullable))
.then(self.metadata.cmp(&other.metadata))
.then_with(|| self.data_type.cmp(other.data_type()))
.then_with(|| self.nullable.cmp(&other.nullable))
.then_with(|| {
// ensure deterministic key order
let mut keys: Vec<&String> =
self.metadata.keys().chain(other.metadata.keys()).collect();
keys.sort();
for k in keys {
match (self.metadata.get(k), other.metadata.get(k)) {
(None, None) => {}
(Some(_), None) => {
return Ordering::Less;
}
(None, Some(_)) => {
return Ordering::Greater;
}
(Some(v1), Some(v2)) => match v1.cmp(v2) {
Ordering::Equal => {}
other => {
return other;
}
},
}
}

Ordering::Equal
})
}
}

Expand All @@ -79,7 +103,14 @@ impl Hash for Field {
self.name.hash(state);
self.data_type.hash(state);
self.nullable.hash(state);
self.metadata.hash(state);

// ensure deterministic key order
let mut keys: Vec<&String> = self.metadata.keys().collect();
keys.sort();
for k in keys {
k.hash(state);
self.metadata.get(k).expect("key valid").hash(state);
}
}
}

Expand All @@ -92,7 +123,7 @@ impl Field {
nullable,
dict_id: 0,
dict_is_ordered: false,
metadata: BTreeMap::default(),
metadata: HashMap::default(),
}
}

Expand All @@ -110,29 +141,29 @@ impl Field {
nullable,
dict_id,
dict_is_ordered,
metadata: BTreeMap::default(),
metadata: HashMap::default(),
}
}

/// Sets the `Field`'s optional custom metadata.
/// The metadata is set as `None` for empty map.
#[inline]
pub fn set_metadata(&mut self, metadata: BTreeMap<String, String>) {
self.metadata = BTreeMap::default();
pub fn set_metadata(&mut self, metadata: HashMap<String, String>) {
self.metadata = HashMap::default();
if !metadata.is_empty() {
self.metadata = metadata;
}
}

/// Sets the metadata of this `Field` to be `metadata` and returns self
pub fn with_metadata(mut self, metadata: BTreeMap<String, String>) -> Self {
pub fn with_metadata(mut self, metadata: HashMap<String, String>) -> Self {
self.set_metadata(metadata);
self
}

/// Returns the immutable reference to the `Field`'s optional custom metadata.
#[inline]
pub const fn metadata(&self) -> &BTreeMap<String, String> {
pub const fn metadata(&self) -> &HashMap<String, String> {
&self.metadata
}

Expand Down Expand Up @@ -545,10 +576,30 @@ mod test {
assert_ne!(get_field_hash(&dict1), get_field_hash(&dict2));
}

#[test]
fn test_field_comparison_metadata() {
let f1 = Field::new("x", DataType::Binary, false).with_metadata(HashMap::from([
(String::from("k1"), String::from("v1")),
(String::from("k2"), String::from("v2")),
]));
let f2 = Field::new("x", DataType::Binary, false).with_metadata(HashMap::from([
(String::from("k1"), String::from("v1")),
(String::from("k3"), String::from("v3")),
]));
let f3 = Field::new("x", DataType::Binary, false).with_metadata(HashMap::from([
(String::from("k1"), String::from("v1")),
(String::from("k3"), String::from("v4")),
]));

assert!(f1.cmp(&f2).is_lt());
assert!(f2.cmp(&f3).is_lt());
assert!(f1.cmp(&f3).is_lt());
}

#[test]
fn test_contains_reflexivity() {
let mut field = Field::new("field1", DataType::Float16, false);
field.set_metadata(BTreeMap::from([
field.set_metadata(HashMap::from([
(String::from("k0"), String::from("v0")),
(String::from("k1"), String::from("v1")),
]));
Expand All @@ -560,14 +611,14 @@ mod test {
let child_field = Field::new("child1", DataType::Float16, false);

let mut field1 = Field::new("field1", DataType::Struct(vec![child_field]), false);
field1.set_metadata(BTreeMap::from([(String::from("k1"), String::from("v1"))]));
field1.set_metadata(HashMap::from([(String::from("k1"), String::from("v1"))]));

let mut field2 = Field::new("field1", DataType::Struct(vec![]), true);
field2.set_metadata(BTreeMap::from([(String::from("k2"), String::from("v2"))]));
field2.set_metadata(HashMap::from([(String::from("k2"), String::from("v2"))]));
field2.try_merge(&field1).unwrap();

let mut field3 = Field::new("field1", DataType::Struct(vec![]), false);
field3.set_metadata(BTreeMap::from([(String::from("k3"), String::from("v3"))]));
field3.set_metadata(HashMap::from([(String::from("k3"), String::from("v3"))]));
field3.try_merge(&field2).unwrap();

assert!(field2.contains(&field1));
Expand Down
23 changes: 10 additions & 13 deletions arrow-schema/src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,6 @@ mod tests {
use super::*;
use crate::datatype::DataType;
use crate::{TimeUnit, UnionMode};
use std::collections::BTreeMap;

#[test]
#[cfg(feature = "serde")]
Expand Down Expand Up @@ -523,7 +522,7 @@ mod tests {

fn person_schema() -> Schema {
let kv_array = [("k".to_string(), "v".to_string())];
let field_metadata: BTreeMap<String, String> = kv_array.iter().cloned().collect();
let field_metadata: HashMap<String, String> = kv_array.iter().cloned().collect();
let first_name =
Field::new("first_name", DataType::Utf8, false).with_metadata(field_metadata);

Expand Down Expand Up @@ -551,18 +550,16 @@ mod tests {
#[test]
fn test_try_merge_field_with_metadata() {
// 1. Different values for the same key should cause error.
let metadata1: BTreeMap<String, String> =
[("foo".to_string(), "bar".to_string())]
.iter()
.cloned()
.collect();
let metadata1: HashMap<String, String> = [("foo".to_string(), "bar".to_string())]
.iter()
.cloned()
.collect();
let f1 = Field::new("first_name", DataType::Utf8, false).with_metadata(metadata1);

let metadata2: BTreeMap<String, String> =
[("foo".to_string(), "baz".to_string())]
.iter()
.cloned()
.collect();
let metadata2: HashMap<String, String> = [("foo".to_string(), "baz".to_string())]
.iter()
.cloned()
.collect();
let f2 = Field::new("first_name", DataType::Utf8, false).with_metadata(metadata2);

assert!(
Expand All @@ -572,7 +569,7 @@ mod tests {

// 2. None + Some
let mut f1 = Field::new("first_name", DataType::Utf8, false);
let metadata2: BTreeMap<String, String> =
let metadata2: HashMap<String, String> =
[("missing".to_string(), "value".to_string())]
.iter()
.cloned()
Expand Down
4 changes: 2 additions & 2 deletions parquet/src/arrow/schema/complex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.

use std::collections::BTreeMap;
use std::collections::HashMap;

use crate::arrow::schema::primitive::convert_primitive;
use crate::arrow::ProjectionMask;
Expand Down Expand Up @@ -347,7 +347,7 @@ impl Visitor {
let value_field = convert_field(map_value, &value, arrow_value);
let field_metadata = match arrow_map {
Some(field) => field.metadata().clone(),
_ => BTreeMap::default(),
_ => HashMap::default(),
};

let map_field = Field::new(
Expand Down

0 comments on commit 57f91f2

Please sign in to comment.