Skip to content

Commit

Permalink
fix(csv)!: infer null for empty column. (#4910)
Browse files Browse the repository at this point in the history
* Infer null for empty column.

* Add test file.
  • Loading branch information
kskalski authored Oct 10, 2023
1 parent 16f5905 commit c6387c1
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 1 deletion.
62 changes: 61 additions & 1 deletion arrow-csv/src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ impl InferredDataType {
/// Returns the inferred data type
fn get(&self) -> DataType {
match self.packed {
0 => DataType::Null,
1 => DataType::Boolean,
2 => DataType::Int64,
4 | 6 => DataType::Float64, // Promote Int64 to Float64
Expand Down Expand Up @@ -785,6 +786,9 @@ fn parse(
null_regex,
)
}
DataType::Null => {
Ok(Arc::new(NullArray::builder(rows.len()).finish()) as ArrayRef)
}
DataType::Utf8 => Ok(Arc::new(
rows.iter()
.map(|row| Some(row.get(i)))
Expand Down Expand Up @@ -1511,6 +1515,62 @@ mod tests {
assert!(!batch.column(1).is_null(4));
}

/// Reads `test/data/init_null_test.csv` with an explicit schema whose last
/// column is `DataType::Null`, then checks the null mask of `c_float` and
/// that the `c_null` column is decoded with the declared type and one slot
/// per row of the batch.
#[test]
fn test_init_nulls() {
    let schema = Arc::new(Schema::new(vec![
        Field::new("c_int", DataType::UInt64, true),
        Field::new("c_float", DataType::Float32, true),
        Field::new("c_string", DataType::Utf8, true),
        Field::new("c_bool", DataType::Boolean, true),
        Field::new("c_null", DataType::Null, true),
    ]));
    let file = File::open("test/data/init_null_test.csv").unwrap();

    let mut csv = ReaderBuilder::new(schema)
        .has_header(true)
        .build(file)
        .unwrap();

    let batch = csv.next().unwrap().unwrap();

    // Expected null mask of `c_float`, one entry per data row checked.
    let expected_float_nulls = [true, false, true, false, false];
    for (row, expected) in expected_float_nulls.iter().enumerate() {
        assert_eq!(
            batch.column(1).is_null(row),
            *expected,
            "unexpected null state for c_float at row {}",
            row
        );
    }

    // The `c_null` column is the reason this fixture exists: make sure it is
    // actually materialised with the requested type and the batch's row count.
    let c_null = batch.column(4);
    assert_eq!(c_null.data_type(), &DataType::Null);
    assert_eq!(c_null.len(), batch.num_rows());
}

/// Infers the schema of `test/data/init_null_test.csv`, expecting the
/// always-empty `c_null` column to be inferred as `DataType::Null`, then
/// reads the file with that schema and checks the null mask of `c_float`
/// and the type and length of the decoded `c_null` column.
#[test]
fn test_init_nulls_with_inference() {
    let format = Format::default().with_header(true).with_delimiter(b',');

    let mut file = File::open("test/data/init_null_test.csv").unwrap();
    let (schema, _) = format.infer_schema(&mut file, None).unwrap();
    // Inference consumed the file; start again from the beginning for the read.
    file.rewind().unwrap();

    let expected_schema = Schema::new(vec![
        Field::new("c_int", DataType::Int64, true),
        Field::new("c_float", DataType::Float64, true),
        Field::new("c_string", DataType::Utf8, true),
        Field::new("c_bool", DataType::Boolean, true),
        Field::new("c_null", DataType::Null, true),
    ]);
    assert_eq!(schema, expected_schema);

    let mut csv = ReaderBuilder::new(Arc::new(schema))
        .with_format(format)
        .build(file)
        .unwrap();

    let batch = csv.next().unwrap().unwrap();

    // Expected null mask of `c_float`, one entry per data row checked.
    let expected_float_nulls = [true, false, true, false, false];
    for (row, expected) in expected_float_nulls.iter().enumerate() {
        assert_eq!(
            batch.column(1).is_null(row),
            *expected,
            "unexpected null state for c_float at row {}",
            row
        );
    }

    // Checking the inferred schema alone does not prove the reader can decode
    // a `Null` column, so inspect the decoded column as well.
    let c_null = batch.column(4);
    assert_eq!(c_null.data_type(), &DataType::Null);
    assert_eq!(c_null.len(), batch.num_rows());
}

#[test]
fn test_custom_nulls() {
let schema = Arc::new(Schema::new(vec![
Expand Down Expand Up @@ -2283,7 +2343,7 @@ mod tests {
#[test]
fn test_inference() {
let cases: &[(&[&str], DataType)] = &[
(&[], DataType::Utf8),
(&[], DataType::Null),
(&["false", "12"], DataType::Utf8),
(&["12", "cupcakes"], DataType::Utf8),
(&["12", "12.4"], DataType::Float64),
Expand Down
6 changes: 6 additions & 0 deletions arrow-csv/test/data/init_null_test.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
c_int,c_float,c_string,c_bool,c_null
,,,,
2,2.2,"a",TRUE,
3,,"b",true,
4,4.4,,False,
5,6.6,"",FALSE,

0 comments on commit c6387c1

Please sign in to comment.