String to decimal conversion written using E/scientific notation (#5611)

* Added decimal casting for the E notation * Clippy * Fmt * Added several new tests Fixed error e without fractional part (e.g. 0e-8, 1e7) * Clippy, fmt * parse_e_notation function minor rework added additional tests for e/scientific notation
apache · Apr 15, 2024 · d84a1a6 · d84a1a6
1 parent 89767cc
commit d84a1a6
Showing 1 changed file with 279 additions and 15 deletions.
diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs
@@ -671,6 +671,126 @@ impl Parser for Date64Type {
     }
 }
 
+fn parse_e_notation<T: DecimalType>(
+    s: &str,
+    mut digits: u16,
+    mut fractionals: i16,
+    mut result: T::Native,
+    index: usize,
+    precision: u16,
+    scale: i16,
+) -> Result<T::Native, ArrowError> {
+    let mut exp: i16 = 0;
+    let base = T::Native::usize_as(10);
+
+    let mut exp_start: bool = false;
+    // e has a plus sign
+    let mut pos_shift_direction: bool = true;
+
+    // skip to point or exponent index
+    let mut bs;
+    if fractionals > 0 {
+        // it's a fraction, so the point index needs to be skipped, so +1
+        bs = s.as_bytes().iter().skip(index + fractionals as usize + 1);
+    } else {
+        // it's actually an integer that is already written into the result, so let's skip on to e
+        bs = s.as_bytes().iter().skip(index);
+    }
+
+    while let Some(b) = bs.next() {
+        match b {
+            b'0'..=b'9' => {
+                result = result.mul_wrapping(base);
+                result = result.add_wrapping(T::Native::usize_as((b - b'0') as usize));
+                if fractionals > 0 {
+                    fractionals += 1;
+                }
+                digits += 1;
+            }
+            &b'e' | &b'E' => {
+                exp_start = true;
+            }
+            _ => {
+                return Err(ArrowError::ParseError(format!(
+                    "can't parse the string value {s} to decimal"
+                )));
+            }
+        };
+
+        if exp_start {
+            pos_shift_direction = match bs.next() {
+                Some(&b'-') => false,
+                Some(&b'+') => true,
+                Some(b) => {
+                    if !b.is_ascii_digit() {
+                        return Err(ArrowError::ParseError(format!(
+                            "can't parse the string value {s} to decimal"
+                        )));
+                    }
+
+                    exp *= 10;
+                    exp += (b - b'0') as i16;
+
+                    true
+                }
+                None => {
+                    return Err(ArrowError::ParseError(format!(
+                        "can't parse the string value {s} to decimal"
+                    )))
+                }
+            };
+
+            for b in bs.by_ref() {
+                if !b.is_ascii_digit() {
+                    return Err(ArrowError::ParseError(format!(
+                        "can't parse the string value {s} to decimal"
+                    )));
+                }
+                exp *= 10;
+                exp += (b - b'0') as i16;
+            }
+        }
+    }
+
+    if digits == 0 && fractionals == 0 && exp == 0 {
+        return Err(ArrowError::ParseError(format!(
+            "can't parse the string value {s} to decimal"
+        )));
+    }
+
+    if !pos_shift_direction {
+        // exponent has a large negative sign
+        // 1.12345e-30 => 0.0{29}12345, scale = 5
+        if exp - (digits as i16 + scale) > 0 {
+            return Ok(T::Native::usize_as(0));
+        }
+        exp *= -1;
+    }
+
+    // point offset
+    exp = fractionals - exp;
+    // We have zeros on the left, we need to count them
+    if !pos_shift_direction && exp > digits as i16 {
+        digits = exp as u16;
+    }
+    // Number of numbers to be removed or added
+    exp = scale - exp;
+
+    if (digits as i16 + exp) as u16 > precision {
+        return Err(ArrowError::ParseError(format!(
+            "parse decimal overflow ({s})"
+        )));
+    }
+
+    if exp < 0 {
+        result = result.div_wrapping(base.pow_wrapping(-exp as _));
+    } else {
+        result = result.mul_wrapping(base.pow_wrapping(exp as _));
+    }
+
+    Ok(result)
+}
+
 /// Parse the string format decimal value to i128/i256 format and checking the precision and scale.
 /// The result value can't be out of bounds.
 pub fn parse_decimal<T: DecimalType>(
@@ -679,8 +799,8 @@ pub fn parse_decimal<T: DecimalType>(
     scale: i8,
 ) -> Result<T::Native, ArrowError> {
     let mut result = T::Native::usize_as(0);
-    let mut fractionals = 0;
-    let mut digits = 0;
+    let mut fractionals: i8 = 0;
+    let mut digits: u8 = 0;
     let base = T::Native::usize_as(10);
 
     let bs = s.as_bytes();
@@ -696,10 +816,13 @@ pub fn parse_decimal<T: DecimalType>(
         )));
     }
 
-    let mut bs = bs.iter();
+    let mut bs = bs.iter().enumerate();
+
+    let mut is_e_notation = false;
+
     // Overflow checks are not required if 10^(precision - 1) <= T::MAX holds.
     // Thus, if we validate the precision correctly, we can skip overflow checks.
-    while let Some(b) = bs.next() {
+    while let Some((index, b)) = bs.next() {
         match b {
             b'0'..=b'9' => {
                 if digits == 0 && *b == b'0' {
@@ -711,8 +834,28 @@ pub fn parse_decimal<T: DecimalType>(
                 result = result.add_wrapping(T::Native::usize_as((b - b'0') as usize));
             }
             b'.' => {
-                for b in bs.by_ref() {
+                let point_index = index;
+
+                for (_, b) in bs.by_ref() {
                     if !b.is_ascii_digit() {
+                        if *b == b'e' || *b == b'E' {
+                            result = match parse_e_notation::<T>(
+                                s,
+                                digits as u16,
+                                fractionals as i16,
+                                result,
+                                point_index,
+                                precision as u16,
+                                scale as i16,
+                            ) {
+                                Err(e) => return Err(e),
+                                Ok(v) => v,
+                            };
+
+                            is_e_notation = true;
+
+                            break;
+                        }
                         return Err(ArrowError::ParseError(format!(
                             "can't parse the string value {s} to decimal"
                         )));
@@ -729,13 +872,35 @@ pub fn parse_decimal<T: DecimalType>(
                     result = result.add_wrapping(T::Native::usize_as((b - b'0') as usize));
                 }
 
+                if is_e_notation {
+                    break;
+                }
+
                 // Fail on "."
                 if digits == 0 {
                     return Err(ArrowError::ParseError(format!(
                         "can't parse the string value {s} to decimal"
                     )));
                 }
             }
+            b'e' | b'E' => {
+                result = match parse_e_notation::<T>(
+                    s,
+                    digits as u16,
+                    fractionals as i16,
+                    result,
+                    index,
+                    precision as u16,
+                    scale as i16,
+                ) {
+                    Err(e) => return Err(e),
+                    Ok(v) => v,
+                };
+
+                is_e_notation = true;
+
+                break;
+            }
             _ => {
                 return Err(ArrowError::ParseError(format!(
                     "can't parse the string value {s} to decimal"
@@ -744,15 +909,21 @@ pub fn parse_decimal<T: DecimalType>(
         }
     }
 
-    if fractionals < scale {
-        let exp = scale - fractionals;
-        if exp as u8 + digits > precision {
-            return Err(ArrowError::ParseError("parse decimal overflow".to_string()));
+    if !is_e_notation {
+        if fractionals < scale {
+            let exp = scale - fractionals;
+            if exp as u8 + digits > precision {
+                return Err(ArrowError::ParseError(format!(
+                    "parse decimal overflow ({s})"
+                )));
+            }
+            let mul = base.pow_wrapping(exp as _);
+            result = result.mul_wrapping(mul);
+        } else if digits > precision {
+            return Err(ArrowError::ParseError(format!(
+                "parse decimal overflow ({s})"
+            )));
         }
-        let mul = base.pow_wrapping(exp as _);
-        result = result.mul_wrapping(mul);
-    } else if digits > precision {
-        return Err(ArrowError::ParseError("parse decimal overflow".to_string()));
     }
 
     Ok(if negative {
@@ -2202,7 +2373,65 @@ mod tests {
             let result_256 = parse_decimal::<Decimal256Type>(s, 20, 3);
             assert_eq!(i256::from_i128(i), result_256.unwrap());
         }
-        let can_not_parse_tests = ["123,123", ".", "123.123.123", "", "+", "-"];
+
+        let e_notation_tests = [
+            ("1.23e3", "1230.0", 2),
+            ("5.6714e+2", "567.14", 4),
+            ("5.6714e-2", "0.056714", 4),
+            ("5.6714e-2", "0.056714", 3),
+            ("5.6741214125e2", "567.41214125", 4),
+            ("8.91E4", "89100.0", 2),
+            ("3.14E+5", "314000.0", 2),
+            ("2.718e0", "2.718", 2),
+            ("9.999999e-1", "0.9999999", 4),
+            ("1.23e+3", "1230", 2),
+            ("1.234559e+3", "1234.559", 2),
+            ("1.00E-10", "0.0000000001", 11),
+            ("1.23e-4", "0.000123", 2),
+            ("9.876e7", "98760000.0", 2),
+            ("5.432E+8", "543200000.0", 10),
+            ("1.234567e9", "1234567000.0", 2),
+            ("1.234567e2", "123.45670000", 2),
+            ("4749.3e-5", "0.047493", 10),
+            ("4749.3e+5", "474930000", 10),
+            ("4749.3e-5", "0.047493", 1),
+            ("4749.3e+5", "474930000", 1),
+            ("0E-8", "0", 10),
+            ("0E+6", "0", 10),
+            ("1E-8", "0.00000001", 10),
+            ("12E+6", "12000000", 10),
+            ("12E-6", "0.000012", 10),
+            ("0.1e-6", "0.0000001", 10),
+            ("0.1e+6", "100000", 10),
+            ("0.12e-6", "0.00000012", 10),
+            ("0.12e+6", "120000", 10),
+            ("000000000001e0", "000000000001", 3),
+            ("000001.1034567002e0", "000001.1034567002", 3),
+        ];
+        for (e, d, scale) in e_notation_tests {
+            let result_128_e = parse_decimal::<Decimal128Type>(e, 20, scale);
+            let result_128_d = parse_decimal::<Decimal128Type>(d, 20, scale);
+            assert_eq!(result_128_e.unwrap(), result_128_d.unwrap());
+            let result_256_e = parse_decimal::<Decimal256Type>(e, 20, scale);
+            let result_256_d = parse_decimal::<Decimal256Type>(d, 20, scale);
+            assert_eq!(result_256_e.unwrap(), result_256_d.unwrap());
+        }
+        let can_not_parse_tests = [
+            "123,123",
+            ".",
+            "123.123.123",
+            "",
+            "+",
+            "-",
+            "e",
+            "1.3e+e3",
+            "5.6714ee-2",
+            "4.11ee-+4",
+            "4.11e++4",
+            "1.1e.12",
+            "1.23e+3.",
+            "1.23e+3.1",
+        ];
         for s in can_not_parse_tests {
             let result_128 = parse_decimal::<Decimal128Type>(s, 20, 3);
             assert_eq!(
@@ -2215,7 +2444,16 @@ mod tests {
                 result_256.unwrap_err().to_string()
             );
         }
-        let overflow_parse_tests = ["12345678", "12345678.9", "99999999.99"];
+        let overflow_parse_tests = [
+            "12345678",
+            "1.2345678e7",
+            "12345678.9",
+            "1.23456789e+7",
+            "99999999.99",
+            "9.999999999e7",
+            "12345678908765.123456",
+            "123456789087651234.56e-4",
+        ];
         for s in overflow_parse_tests {
             let result_128 = parse_decimal::<Decimal128Type>(s, 10, 3);
             let expected_128 = "Parser error: parse decimal overflow";
@@ -2262,6 +2500,16 @@ mod tests {
                 99999999999999999999999999999999999999i128,
                 38,
             ),
+            (
+                "0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001016744",
+                0i128,
+                15,
+            ),
+            (
+                "1.016744e-320",
+                0i128,
+                15,
+            ),
         ];
         for (s, i, scale) in edge_tests_128 {
             let result_128 = parse_decimal::<Decimal128Type>(s, 38, scale);
@@ -2292,6 +2540,14 @@ mod tests {
                 .unwrap(),
                 26,
             ),
+            (
+                "9.999999999999999999999999999999999999999999999999999999999999999999999999999e49",
+                i256::from_string(
+                    "9999999999999999999999999999999999999999999999999999999999999999999999999999",
+                )
+                .unwrap(),
+                26,
+            ),
             (
                 "99999999999999999999999999999999999999999999999999",
                 i256::from_string(
@@ -2300,6 +2556,14 @@ mod tests {
                 .unwrap(),
                 26,
             ),
+            (
+                "9.9999999999999999999999999999999999999999999999999e+49",
+                i256::from_string(
+                    "9999999999999999999999999999999999999999999999999900000000000000000000000000",
+                )
+                .unwrap(),
+                26,
+            ),
         ];
         for (s, i, scale) in edge_tests_256 {
             let result = parse_decimal::<Decimal256Type>(s, 76, scale);