Skip to content

Commit

Permalink
feat: support encoding of binary in CSV writer
Browse files Browse the repository at this point in the history
Allows writing binary data (Binary, LargeBinary, and FixedSizeBinary) to
CSV. Note: FixedSizeBinary was already supported in this way.

Values are encoded as hexadecimal using the default Arrow formatter.

A test was added that accounts for null values when encoding all three
binary types in CSV.
  • Loading branch information
hiltontj committed May 17, 2024
1 parent dfe0f26 commit 05c8f48
Showing 1 changed file with 64 additions and 10 deletions.
74 changes: 64 additions & 10 deletions arrow-csv/src/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -131,15 +131,15 @@ impl<W: Write> Writer<W> {
let converters = batch
.columns()
.iter()
.map(|a| match a.data_type() {
d if d.is_nested() => Err(ArrowError::CsvError(format!(
"Nested type {} is not supported in CSV",
a.data_type()
))),
DataType::Binary | DataType::LargeBinary => Err(ArrowError::CsvError(
"Binary data cannot be written to CSV".to_string(),
)),
_ => ArrayFormatter::try_new(a.as_ref(), &options),
.map(|a| {
if a.data_type().is_nested() {
Err(ArrowError::CsvError(format!(
"Nested type {} is not supported in CSV",
a.data_type()
)))
} else {
ArrayFormatter::try_new(a.as_ref(), &options)
}
})
.collect::<Result<Vec<_>, ArrowError>>()?;

Expand Down Expand Up @@ -425,7 +425,10 @@ mod tests {
use super::*;

use crate::ReaderBuilder;
use arrow_array::builder::{Decimal128Builder, Decimal256Builder};
use arrow_array::builder::{
BinaryBuilder, Decimal128Builder, Decimal256Builder, FixedSizeBinaryBuilder,
LargeBinaryBuilder,
};
use arrow_array::types::*;
use arrow_buffer::i256;
use std::io::{Cursor, Read, Seek};
Expand Down Expand Up @@ -759,4 +762,55 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo
String::from_utf8(buffer).unwrap()
);
}

/// Writes one batch holding `Binary`, `FixedSizeBinary`, and `LargeBinary` columns and checks
/// the CSV text: each value appears as lowercase hex and each null appears as an empty field.
#[test]
fn test_write_csv_binary() {
    let byte_width = 8;
    let fields = vec![
        Field::new("c1", DataType::Binary, true),
        Field::new("c2", DataType::FixedSizeBinary(byte_width), true),
        Field::new("c3", DataType::LargeBinary, true),
    ];
    let schema = SchemaRef::new(Schema::new(fields));

    // Variable-length binary column: three values with a null in the third row.
    let mut binary = BinaryBuilder::new();
    for entry in [Some(&b"Homer"[..]), Some(&b"Bart"[..]), None, Some(&b"Ned"[..])] {
        match entry {
            Some(bytes) => {
                binary.append_value(bytes);
            }
            None => {
                binary.append_null();
            }
        }
    }

    // Fixed-width column: every value is exactly `byte_width` bytes long.
    let mut fixed = FixedSizeBinaryBuilder::new(byte_width);
    for entry in [Some(b"Simpson "), Some(b"Simpson "), None, Some(b"Flanders")] {
        match entry {
            Some(bytes) => {
                fixed.append_value(bytes).unwrap();
            }
            None => {
                fixed.append_null();
            }
        }
    }

    // Large binary column: null everywhere except the third row.
    let mut large = LargeBinaryBuilder::new();
    for entry in [None, None, Some(&b"Comic Book Guy"[..]), None] {
        match entry {
            Some(bytes) => {
                large.append_value(bytes);
            }
            None => {
                large.append_null();
            }
        }
    }

    let batch = RecordBatch::try_new(
        schema,
        vec![
            Arc::new(binary.finish()) as ArrayRef,
            Arc::new(fixed.finish()) as ArrayRef,
            Arc::new(large.finish()) as ArrayRef,
        ],
    )
    .unwrap();

    let mut output = Vec::new();
    {
        // The writer holds a mutable borrow of `output`; the inner scope ends that borrow
        // before the bytes are read back.
        let mut csv_writer = WriterBuilder::new().build(&mut output);
        csv_writer.write(&batch).unwrap();
    }

    let expected = "\
        c1,c2,c3\n\
        486f6d6572,53696d70736f6e20,\n\
        42617274,53696d70736f6e20,\n\
        ,,436f6d696320426f6f6b20477579\n\
        4e6564,466c616e64657273,\n\
        ";
    assert_eq!(expected, String::from_utf8(output).unwrap());
}
}

0 comments on commit 05c8f48

Please sign in to comment.