Skip to content

Commit

Permalink
Use regex instead of hash-maps
Browse files Browse the repository at this point in the history
  • Loading branch information
vrongmeal committed Sep 11, 2023
1 parent ed47efe commit f7c72ae
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 18 deletions.
35 changes: 18 additions & 17 deletions arrow-csv/src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,7 @@ use arrow_schema::*;
use chrono::{TimeZone, Utc};
use csv::StringRecord;
use lazy_static::lazy_static;
use regex::RegexSet;
use std::collections::HashSet;
use regex::{Regex, RegexSet};
use std::fmt::{self, Debug};
use std::fs::File;
use std::io::{BufRead, BufReader as StdBufReader, Read, Seek, SeekFrom};
Expand Down Expand Up @@ -214,7 +213,7 @@ pub struct Format {
escape: Option<u8>,
quote: Option<u8>,
terminator: Option<u8>,
nulls: HashSet<String>,
null_regex: Option<Regex>,
}

impl Format {
Expand Down Expand Up @@ -243,8 +242,8 @@ impl Format {
self
}

pub fn with_nulls(mut self, nulls: HashSet<String>) -> Self {
self.nulls = nulls;
/// Sets the regular expression used to recognise `NULL` values.
///
/// The pattern is stored as `Some(null_regex)`, replacing any pattern that
/// was configured earlier, and the updated value is returned so calls can
/// be chained builder-style.
///
/// NOTE(review): the reader/parser construction code in this `impl` still
/// carries a null-regex TODO, so the stored pattern may not be consulted
/// there yet; confirm before relying on it.
pub fn with_null_regex(mut self, null_regex: Regex) -> Self {
    let pattern = Some(null_regex);
    self.null_regex = pattern;
    self
}

Expand Down Expand Up @@ -326,6 +325,7 @@ impl Format {
if let Some(t) = self.terminator {
builder.terminator(csv::Terminator::Any(t));
}
// TODO: take `null_regex` into account when building this reader
builder.from_reader(reader)
}

Expand All @@ -343,6 +343,7 @@ impl Format {
if let Some(t) = self.terminator {
builder.terminator(csv_core::Terminator::Any(t));
}
// TODO: take `null_regex` into account when building this parser
builder.build()
}
}
Expand Down Expand Up @@ -564,7 +565,7 @@ pub struct Decoder {
/// A decoder for [`StringRecords`]
record_decoder: RecordDecoder,

/// Check for if the string is `NULL` value or not.
/// Check if the string matches this pattern for `NULL`.
is_null: Box<dyn Fn(&str) -> bool>,
}

Expand Down Expand Up @@ -1029,8 +1030,8 @@ pub struct ReaderBuilder {
bounds: Bounds,
/// Optional projection for which columns to load (zero-based column indices)
projection: Option<Vec<usize>>,
/// Strings to consider as `NULL` when parsing.
nulls: HashSet<String>,
/// Pattern to consider as `NULL` when parsing.
null_regex: Option<Regex>,
}

impl ReaderBuilder {
Expand Down Expand Up @@ -1062,7 +1063,7 @@ impl ReaderBuilder {
batch_size: 1024,
bounds: None,
projection: None,
nulls: HashSet::new(),
null_regex: None,
}
}

Expand Down Expand Up @@ -1099,8 +1100,8 @@ impl ReaderBuilder {
self
}

pub fn with_nulls(mut self, nulls: HashSet<String>) -> Self {
self.nulls = nulls;
/// Sets the pattern whose matches are treated as `NULL` when parsing.
///
/// When a pattern is configured, a field counts as `NULL` if it is empty
/// or if the pattern matches it; without one, only empty fields are
/// `NULL`. Matching uses `Regex::is_match`, so anchor the pattern (for
/// example `^nil$`) when the whole field must match rather than a
/// substring of it.
///
/// Returns the builder so that calls can be chained.
pub fn with_null_regex(mut self, null_regex: Regex) -> Self {
    let pattern = Some(null_regex);
    self.null_regex = pattern;
    self
}

Expand Down Expand Up @@ -1154,11 +1155,11 @@ impl ReaderBuilder {
None => (header, usize::MAX),
};

let is_null: Box<dyn Fn(&str) -> bool> = if self.nulls.is_empty() {
Box::new(|s| s.is_empty())
let is_null: Box<dyn Fn(&str) -> bool> = if let Some(null_regex) = self.null_regex
{
Box::new(move |s| s.is_empty() || null_regex.is_match(s))
} else {
let nulls = self.nulls;
Box::new(move |s| s.is_empty() || nulls.contains(s))
Box::new(|s| s.is_empty())
};

Decoder {
Expand Down Expand Up @@ -1507,11 +1508,11 @@ mod tests {

let file = File::open("test/data/custom_null_test.csv").unwrap();

let nulls: HashSet<String> = ["nil"].into_iter().map(|s| s.to_string()).collect();
let null_regex = Regex::new("^nil$").unwrap();

let mut csv = ReaderBuilder::new(schema)
.has_header(true)
.with_nulls(nulls)
.with_null_regex(null_regex)
.build(file)
.unwrap();

Expand Down
2 changes: 1 addition & 1 deletion arrow-csv/test/data/custom_null_test.csv
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ c_int,c_float,c_string,c_bool
nil,2.2,"2.22",TRUE
3,nil,"3.33",true
4,4.4,nil,False
5,6.6,"",nil
5,6.6,"",nil

0 comments on commit f7c72ae

Please sign in to comment.