Skip to content

Commit

Permalink
feat(arrow-json): encode Binary and LargeBinary types as hex when…
Browse files Browse the repository at this point in the history
… writing JSON (#5785)

* feat: encode Binary and LargeBinary types in JSON as hex

Added the ability for the JSON writer to encode Binary and LargeBinary
types as hex strings. This follows the existing behaviour for FixedSizeBinary.

A test was added to check functionality for both Binary and LargeBinary.

* refactor: use ArrayAccessor instead of custom trait

* refactor: use generic in test instead of macro

* refactor: use const DATA_TYPE from GenericBinaryType
  • Loading branch information
hiltontj authored May 21, 2024
1 parent 3e7e701 commit b07fd5d
Show file tree
Hide file tree
Showing 2 changed files with 106 additions and 5 deletions.
83 changes: 83 additions & 0 deletions arrow-json/src/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1565,6 +1565,89 @@ mod tests {
Ok(())
}

/// Writes a nullable binary column to JSON and checks the hex-encoded output.
///
/// The offset type `O` picks the array flavour under test (`i32` for `Binary`,
/// `i64` for `LargeBinary`). The batch holds two values separated by a null
/// and is serialised twice: once with explicit nulls and once without.
fn binary_encoding_test<O: OffsetSizeTrait>() {
    // One nullable "bytes" column whose data type follows from the offset width.
    let field = Field::new("bytes", GenericBinaryType::<O>::DATA_TYPE, true);
    let schema = SchemaRef::new(Schema::new(vec![field]));

    // Rows: value, null, value.
    let mut builder = GenericByteBuilder::<GenericBinaryType<O>>::new();
    builder.append_value(b"Ned Flanders");
    builder.append_null();
    builder.append_value(b"Troy McClure");
    let column: ArrayRef = Arc::new(builder.finish());
    let batch = RecordBatch::try_new(schema, vec![column]).unwrap();

    // Pass 1: explicit nulls enabled, so the null row keeps its key.
    let mut explicit_buf = Vec::new();
    {
        // Scope the writer so its borrow of the buffer ends before parsing.
        let mut writer = WriterBuilder::new()
            .with_explicit_nulls(true)
            .build::<_, JsonArray>(&mut explicit_buf);
        writer.write(&batch).unwrap();
        writer.close().unwrap();
    }
    let with_nulls: Value = serde_json::from_slice(&explicit_buf).unwrap();
    assert_eq!(
        json!([
            { "bytes": "4e656420466c616e64657273" },
            { "bytes": null },
            { "bytes": "54726f79204d63436c757265" }
        ]),
        with_nulls,
    );

    // Pass 2: default writer; explicit nulls are off by default, so the
    // null row is expected to come out as an empty object.
    let mut default_buf = Vec::new();
    {
        let mut writer = ArrayWriter::new(&mut default_buf);
        writer.write(&batch).unwrap();
        writer.close().unwrap();
    }
    let without_nulls: Value = serde_json::from_slice(&default_buf).unwrap();
    assert_eq!(
        json!([
            { "bytes": "4e656420466c616e64657273" },
            {},
            { "bytes": "54726f79204d63436c757265" }
        ]),
        without_nulls
    );
}

/// Runs the shared binary encoding check once per offset width.
#[test]
fn test_writer_binary() {
    binary_encoding_test::<i32>(); // Binary
    binary_encoding_test::<i64>(); // LargeBinary
}

#[test]
fn test_writer_fixed_size_binary() {
// set up schema:
Expand Down
28 changes: 23 additions & 5 deletions arrow-json/src/writer/encoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,17 @@ fn make_encoder_impl<'a>(

DataType::FixedSizeBinary(_) => {
let array = array.as_fixed_size_binary();
(Box::new(FixedSizeBinaryEncoder::new(array)) as _, array.nulls().cloned())
(Box::new(BinaryEncoder::new(array)) as _, array.nulls().cloned())
}

DataType::Binary => {
let array: &BinaryArray = array.as_binary();
(Box::new(BinaryEncoder::new(array)) as _, array.nulls().cloned())
}

DataType::LargeBinary => {
let array: &LargeBinaryArray = array.as_binary();
(Box::new(BinaryEncoder::new(array)) as _, array.nulls().cloned())
}

DataType::Struct(fields) => {
Expand Down Expand Up @@ -509,15 +519,23 @@ impl<'a> Encoder for MapEncoder<'a> {
}
}

struct FixedSizeBinaryEncoder<'a>(&'a FixedSizeBinaryArray);
/// New-type wrapper for encoding the binary types in arrow: `Binary`, `LargeBinary`
/// and `FixedSizeBinary` as hex strings in JSON.
///
/// `B` is the wrapped array handle. The struct itself places no bound on it;
/// the impl blocks require `B: ArrayAccessor<Item = &[u8]>`.
struct BinaryEncoder<B>(B);

impl<'a> FixedSizeBinaryEncoder<'a> {
fn new(array: &'a FixedSizeBinaryArray) -> Self {
impl<'a, B> BinaryEncoder<B>
where
B: ArrayAccessor<Item = &'a [u8]>,
{
/// Wraps `array` in a `BinaryEncoder`.
///
/// The accessor is stored as-is; no values are read at construction time.
fn new(array: B) -> Self {
Self(array)
}
}

impl<'a> Encoder for FixedSizeBinaryEncoder<'a> {
impl<'a, B> Encoder for BinaryEncoder<B>
where
B: ArrayAccessor<Item = &'a [u8]>,
{
fn encode(&mut self, idx: usize, out: &mut Vec<u8>) {
out.push(b'"');
for byte in self.0.value(idx) {
Expand Down

0 comments on commit b07fd5d

Please sign in to comment.