From 39da0a2e68fc794ba321ff931dab5f57d30dc003 Mon Sep 17 00:00:00 2001 From: Josh Casale Date: Mon, 15 Apr 2024 16:23:00 +0100 Subject: [PATCH 1/3] Modify decimal regex to accept positive exponent specifier --- arrow-csv/src/reader/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 5e0530289623..fc618e613a97 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -148,7 +148,7 @@ lazy_static! { static ref REGEX_SET: RegexSet = RegexSet::new([ r"(?i)^(true)$|^(false)$(?-i)", //BOOLEAN r"^-?(\d+)$", //INTEGER - r"^-?((\d*\.\d+|\d+\.\d*)([eE]-?\d+)?|\d+([eE]-?\d+))$", //DECIMAL + r"^-?((\d*\.\d+|\d+\.\d*)([eE][-+]?\d+)?|\d+([eE][-+]?\d+))$", //DECIMAL r"^\d{4}-\d\d-\d\d$", //DATE32 r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d(?:[^\d\.].*)?$", //Timestamp(Second) r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d\.\d{1,3}(?:[^\d].*)?$", //Timestamp(Millisecond) From aed4dae6031d131f9d226f9ac305b485907e0a66 Mon Sep 17 00:00:00 2001 From: Josh Casale Date: Mon, 15 Apr 2024 19:49:48 +0100 Subject: [PATCH 2/3] add test for positive exponent specifier --- arrow-csv/src/reader/mod.rs | 21 +++++++++++++++++++ .../test/data/scientific_notation_test.csv | 19 +++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 arrow-csv/test/data/scientific_notation_test.csv diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index fc618e613a97..943bd890536d 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -1658,6 +1658,27 @@ mod tests { assert_eq!(batch.schema().as_ref(), &expected_schema); } + #[test] + fn test_scientific_notation_with_inference() { + let mut file = File::open("test/data/scientific_notation_test.csv").unwrap(); + let format = Format::default().with_header(false).with_delimiter(b','); + + let (schema, _) = format.infer_schema(&mut file, None).unwrap(); + file.rewind().unwrap(); + + let builder = 
ReaderBuilder::new(Arc::new(schema)) + .with_format(format) + .with_batch_size(512) + .with_projection(vec![0, 1]); + + let mut csv = builder.build(file).unwrap(); + let batch = csv.next().unwrap().unwrap(); + + let schema = batch.schema(); + + assert_eq!(&DataType::Float64, schema.field(0).data_type()); + } + #[test] fn test_parse_invalid_csv() { let file = File::open("test/data/various_types_invalid.csv").unwrap(); diff --git a/arrow-csv/test/data/scientific_notation_test.csv b/arrow-csv/test/data/scientific_notation_test.csv new file mode 100644 index 000000000000..d68abcb94e38 --- /dev/null +++ b/arrow-csv/test/data/scientific_notation_test.csv @@ -0,0 +1,19 @@ +1.439e+04, positive_modifier +1.31e+04, positive_modifier +1.2711e+0, positive_modifier +1.44e+04, positive_modifier +2.22e+04, positive_modifier +1.149e+04, positive_modifier +2.139e+04, positive_modifier +7.322e+04, positive_modifier +1.531e+04, positive_modifier +2.206e-04, negative_specifier +1.517e-04, negative_specifier +2.332e-04, negative_specifier +2.19e-04, negative_specifier +2.087e-04, negative_specifier +12683.18, no_modifier +7134.6, no_modifier +8540.17, no_modifier +21462.36, no_modifier +1120.76, no_modifier \ No newline at end of file From ec1ede5cbe9c928ab95fb8e3cfcecd7d723c7700 Mon Sep 17 00:00:00 2001 From: Josh Casale Date: Mon, 15 Apr 2024 19:55:05 +0100 Subject: [PATCH 3/3] Rename test CSV labels to use consistent exponent terminology --- .../test/data/scientific_notation_test.csv | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/arrow-csv/test/data/scientific_notation_test.csv b/arrow-csv/test/data/scientific_notation_test.csv index d68abcb94e38..632c3ef8bc51 100644 --- a/arrow-csv/test/data/scientific_notation_test.csv +++ b/arrow-csv/test/data/scientific_notation_test.csv @@ -1,19 +1,19 @@ -1.439e+04, positive_modifier -1.31e+04, positive_modifier -1.2711e+0, positive_modifier -1.44e+04, positive_modifier -2.22e+04, positive_modifier -1.149e+04, positive_modifier -2.139e+04, positive_modifier 
-7.322e+04, positive_modifier -1.531e+04, positive_modifier -2.206e-04, negative_specifier -1.517e-04, negative_specifier -2.332e-04, negative_specifier -2.19e-04, negative_specifier -2.087e-04, negative_specifier -12683.18, no_modifier -7134.6, no_modifier -8540.17, no_modifier -21462.36, no_modifier -1120.76, no_modifier \ No newline at end of file +1.439e+04, positive_exponent +1.31e+04, positive_exponent +1.2711e+0, positive_exponent +1.44e+04, positive_exponent +2.22e+04, positive_exponent +1.149e+04, positive_exponent +2.139e+04, positive_exponent +7.322e+04, positive_exponent +1.531e+04, positive_exponent +2.206e-04, negative_exponent +1.517e-04, negative_exponent +2.332e-04, negative_exponent +2.19e-04, negative_exponent +2.087e-04, negative_exponent +12683.18, no_exponent +7134.6, no_exponent +8540.17, no_exponent +21462.36, no_exponent +1120.76, no_exponent \ No newline at end of file