From f7c72ae942a7008a528045f357af5aa0bc932fa7 Mon Sep 17 00:00:00 2001
From: Vaibhav
Date: Mon, 11 Sep 2023 18:52:01 +0530
Subject: [PATCH] Use regex instead of hash-maps

---
 arrow-csv/src/reader/mod.rs              | 35 ++++++++++++------------
 arrow-csv/test/data/custom_null_test.csv |  2 +-
 2 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs
index faaf8d3aed9..544c3a6cd60 100644
--- a/arrow-csv/src/reader/mod.rs
+++ b/arrow-csv/src/reader/mod.rs
@@ -133,8 +133,7 @@ use arrow_schema::*;
 use chrono::{TimeZone, Utc};
 use csv::StringRecord;
 use lazy_static::lazy_static;
-use regex::RegexSet;
-use std::collections::HashSet;
+use regex::{Regex, RegexSet};
 use std::fmt::{self, Debug};
 use std::fs::File;
 use std::io::{BufRead, BufReader as StdBufReader, Read, Seek, SeekFrom};
@@ -214,7 +213,7 @@ pub struct Format {
     escape: Option<u8>,
     quote: Option<u8>,
     terminator: Option<u8>,
-    nulls: HashSet<String>,
+    null_regex: Option<Regex>,
 }
 
 impl Format {
@@ -243,8 +242,8 @@ impl Format {
         self
     }
 
-    pub fn with_nulls(mut self, nulls: HashSet<String>) -> Self {
-        self.nulls = nulls;
+    pub fn with_null_regex(mut self, null_regex: Regex) -> Self {
+        self.null_regex = Some(null_regex);
         self
     }
 
@@ -326,6 +325,7 @@ impl Format {
         if let Some(t) = self.terminator {
             builder.terminator(csv::Terminator::Any(t));
         }
+        // TODO: Null regex
         builder.from_reader(reader)
     }
 
@@ -343,6 +343,7 @@ impl Format {
         if let Some(t) = self.terminator {
             builder.terminator(csv_core::Terminator::Any(t));
         }
+        // TODO: Null regex
         builder.build()
     }
 }
@@ -564,7 +565,7 @@ pub struct Decoder {
     /// A decoder for [`StringRecords`]
     record_decoder: RecordDecoder,
 
-    /// Check for if the string is `NULL` value or not.
+    /// Check if the string matches this pattern for `NULL`.
     is_null: Box<dyn Fn(&str) -> bool>,
 }
 
@@ -1029,8 +1030,8 @@ pub struct ReaderBuilder {
     bounds: Bounds,
     /// Optional projection for which columns to load (zero-based column indices)
     projection: Option<Vec<usize>>,
-    /// Strings to consider as `NULL` when parsing.
-    nulls: HashSet<String>,
+    /// Pattern to consider as `NULL` when parsing.
+    null_regex: Option<Regex>,
 }
 
 impl ReaderBuilder {
@@ -1062,7 +1063,7 @@ impl ReaderBuilder {
             batch_size: 1024,
             bounds: None,
             projection: None,
-            nulls: HashSet::new(),
+            null_regex: None,
         }
     }
 
@@ -1099,8 +1100,8 @@ impl ReaderBuilder {
         self
     }
 
-    pub fn with_nulls(mut self, nulls: HashSet<String>) -> Self {
-        self.nulls = nulls;
+    pub fn with_null_regex(mut self, null_regex: Regex) -> Self {
+        self.null_regex = Some(null_regex);
         self
     }
 
@@ -1154,11 +1155,11 @@ impl ReaderBuilder {
             None => (header, usize::MAX),
         };
 
-        let is_null: Box<dyn Fn(&str) -> bool> = if self.nulls.is_empty() {
-            Box::new(|s| s.is_empty())
+        let is_null: Box<dyn Fn(&str) -> bool> = if let Some(null_regex) = self.null_regex
+        {
+            Box::new(move |s| s.is_empty() || null_regex.is_match(s))
         } else {
-            let nulls = self.nulls;
-            Box::new(move |s| s.is_empty() || nulls.contains(s))
+            Box::new(|s| s.is_empty())
         };
 
         Decoder {
@@ -1507,11 +1508,11 @@ mod tests {
 
         let file = File::open("test/data/custom_null_test.csv").unwrap();
 
-        let nulls: HashSet<String> = ["nil"].into_iter().map(|s| s.to_string()).collect();
+        let null_regex = Regex::new("^nil$").unwrap();
 
         let mut csv = ReaderBuilder::new(schema)
             .has_header(true)
-            .with_nulls(nulls)
+            .with_null_regex(null_regex)
             .build(file)
             .unwrap();
 
diff --git a/arrow-csv/test/data/custom_null_test.csv b/arrow-csv/test/data/custom_null_test.csv
index 30d7b7f2a1b..747cd25f51e 100644
--- a/arrow-csv/test/data/custom_null_test.csv
+++ b/arrow-csv/test/data/custom_null_test.csv
@@ -3,4 +3,4 @@ c_int,c_float,c_string,c_bool
 nil,2.2,"2.22",TRUE
 3,nil,"3.33",true
 4,4.4,nil,False
-5,6.6,"",nil
\ No newline at end of file
+5,6.6,"",nil