diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index afa00f176293..3102fcc58ada 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -671,6 +671,126 @@ impl Parser for Date64Type { } } +fn parse_e_notation( + s: &str, + mut digits: u16, + mut fractionals: i16, + mut result: T::Native, + index: usize, + precision: u16, + scale: i16, +) -> Result { + let mut exp: i16 = 0; + let base = T::Native::usize_as(10); + + let mut exp_start: bool = false; + // e has a plus sign + let mut pos_shift_direction: bool = true; + + // skip to point or exponent index + let mut bs; + if fractionals > 0 { + // it's a fraction, so the point index needs to be skipped, so +1 + bs = s.as_bytes().iter().skip(index + fractionals as usize + 1); + } else { + // it's actually an integer that is already written into the result, so let's skip on to e + bs = s.as_bytes().iter().skip(index); + } + + while let Some(b) = bs.next() { + match b { + b'0'..=b'9' => { + result = result.mul_wrapping(base); + result = result.add_wrapping(T::Native::usize_as((b - b'0') as usize)); + if fractionals > 0 { + fractionals += 1; + } + digits += 1; + } + &b'e' | &b'E' => { + exp_start = true; + } + _ => { + return Err(ArrowError::ParseError(format!( + "can't parse the string value {s} to decimal" + ))); + } + }; + + if exp_start { + pos_shift_direction = match bs.next() { + Some(&b'-') => false, + Some(&b'+') => true, + Some(b) => { + if !b.is_ascii_digit() { + return Err(ArrowError::ParseError(format!( + "can't parse the string value {s} to decimal" + ))); + } + + exp *= 10; + exp += (b - b'0') as i16; + + true + } + None => { + return Err(ArrowError::ParseError(format!( + "can't parse the string value {s} to decimal" + ))) + } + }; + + for b in bs.by_ref() { + if !b.is_ascii_digit() { + return Err(ArrowError::ParseError(format!( + "can't parse the string value {s} to decimal" + ))); + } + exp *= 10; + exp += (b - b'0') as i16; + } + } + } + + if digits == 0 && fractionals == 0 && exp == 0 { + return Err(ArrowError::ParseError(format!( + "can't parse the string value {s} to decimal" + ))); + } + + if !pos_shift_direction { + // exponent has a large negative sign + // 1.12345e-30 => 0.0{29}12345, scale = 5 + if exp - (digits as i16 + scale) > 0 { + return Ok(T::Native::usize_as(0)); + } + exp *= -1; + } + + // point offset + exp = fractionals - exp; + // We have zeros on the left, we need to count them + if !pos_shift_direction && exp > digits as i16 { + digits = exp as u16; + } + // Number of numbers to be removed or added + exp = scale - exp; + + if (digits as i16 + exp) as u16 > precision { + return Err(ArrowError::ParseError(format!( + "parse decimal overflow ({s})" + ))); + } + + if exp < 0 { + result = result.div_wrapping(base.pow_wrapping(-exp as _)); + } else { + result = result.mul_wrapping(base.pow_wrapping(exp as _)); + } + + Ok(result) +} + /// Parse the string format decimal value to i128/i256 format and checking the precision and scale. /// The result value can't be out of bounds. pub fn parse_decimal( @@ -679,8 +799,8 @@ pub fn parse_decimal( scale: i8, ) -> Result { let mut result = T::Native::usize_as(0); - let mut fractionals = 0; - let mut digits = 0; + let mut fractionals: i8 = 0; + let mut digits: u8 = 0; let base = T::Native::usize_as(10); let bs = s.as_bytes(); @@ -696,10 +816,13 @@ pub fn parse_decimal( ))); } - let mut bs = bs.iter(); + let mut bs = bs.iter().enumerate(); + + let mut is_e_notation = false; + // Overflow checks are not required if 10^(precision - 1) <= T::MAX holds. // Thus, if we validate the precision correctly, we can skip overflow checks. - while let Some(b) = bs.next() { + while let Some((index, b)) = bs.next() { match b { b'0'..=b'9' => { if digits == 0 && *b == b'0' { @@ -711,8 +834,28 @@ pub fn parse_decimal( result = result.add_wrapping(T::Native::usize_as((b - b'0') as usize)); } b'.' => { - for b in bs.by_ref() { + let point_index = index; + + for (_, b) in bs.by_ref() { if !b.is_ascii_digit() { + if *b == b'e' || *b == b'E' { + result = match parse_e_notation::( + s, + digits as u16, + fractionals as i16, + result, + point_index, + precision as u16, + scale as i16, + ) { + Err(e) => return Err(e), + Ok(v) => v, + }; + + is_e_notation = true; + + break; + } return Err(ArrowError::ParseError(format!( "can't parse the string value {s} to decimal" ))); @@ -729,6 +872,10 @@ pub fn parse_decimal( result = result.add_wrapping(T::Native::usize_as((b - b'0') as usize)); } + if is_e_notation { + break; + } + // Fail on "." if digits == 0 { return Err(ArrowError::ParseError(format!( @@ -736,6 +883,24 @@ pub fn parse_decimal( ))); } } + b'e' | b'E' => { + result = match parse_e_notation::( + s, + digits as u16, + fractionals as i16, + result, + index, + precision as u16, + scale as i16, + ) { + Err(e) => return Err(e), + Ok(v) => v, + }; + + is_e_notation = true; + + break; + } _ => { return Err(ArrowError::ParseError(format!( "can't parse the string value {s} to decimal" @@ -744,15 +909,21 @@ pub fn parse_decimal( } } - if fractionals < scale { - let exp = scale - fractionals; - if exp as u8 + digits > precision { - return Err(ArrowError::ParseError("parse decimal overflow".to_string())); + if !is_e_notation { + if fractionals < scale { + let exp = scale - fractionals; + if exp as u8 + digits > precision { + return Err(ArrowError::ParseError(format!( + "parse decimal overflow ({s})" + ))); + } + let mul = base.pow_wrapping(exp as _); + result = result.mul_wrapping(mul); + } else if digits > precision { + return Err(ArrowError::ParseError(format!( + "parse decimal overflow ({s})" + ))); } - let mul = base.pow_wrapping(exp as _); - result = result.mul_wrapping(mul); - } else if digits > precision { - return Err(ArrowError::ParseError("parse decimal overflow".to_string())); } Ok(if negative { @@ -2202,7 +2373,65 @@ mod tests { let result_256 = parse_decimal::(s, 20, 3); assert_eq!(i256::from_i128(i), result_256.unwrap()); } - let can_not_parse_tests = ["123,123", ".", "123.123.123", "", "+", "-"]; + + let e_notation_tests = [ + ("1.23e3", "1230.0", 2), + ("5.6714e+2", "567.14", 4), + ("5.6714e-2", "0.056714", 4), + ("5.6714e-2", "0.056714", 3), + ("5.6741214125e2", "567.41214125", 4), + ("8.91E4", "89100.0", 2), + ("3.14E+5", "314000.0", 2), + ("2.718e0", "2.718", 2), + ("9.999999e-1", "0.9999999", 4), + ("1.23e+3", "1230", 2), + ("1.234559e+3", "1234.559", 2), + ("1.00E-10", "0.0000000001", 11), + ("1.23e-4", "0.000123", 2), + ("9.876e7", "98760000.0", 2), + ("5.432E+8", "543200000.0", 10), + ("1.234567e9", "1234567000.0", 2), + ("1.234567e2", "123.45670000", 2), + ("4749.3e-5", "0.047493", 10), + ("4749.3e+5", "474930000", 10), + ("4749.3e-5", "0.047493", 1), + ("4749.3e+5", "474930000", 1), + ("0E-8", "0", 10), + ("0E+6", "0", 10), + ("1E-8", "0.00000001", 10), + ("12E+6", "12000000", 10), + ("12E-6", "0.000012", 10), + ("0.1e-6", "0.0000001", 10), + ("0.1e+6", "100000", 10), + ("0.12e-6", "0.00000012", 10), + ("0.12e+6", "120000", 10), + ("000000000001e0", "000000000001", 3), + ("000001.1034567002e0", "000001.1034567002", 3), + ]; + for (e, d, scale) in e_notation_tests { + let result_128_e = parse_decimal::(e, 20, scale); + let result_128_d = parse_decimal::(d, 20, scale); + assert_eq!(result_128_e.unwrap(), result_128_d.unwrap()); + let result_256_e = parse_decimal::(e, 20, scale); + let result_256_d = parse_decimal::(d, 20, scale); + assert_eq!(result_256_e.unwrap(), result_256_d.unwrap()); + } + let can_not_parse_tests = [ + "123,123", + ".", + "123.123.123", + "", + "+", + "-", + "e", + "1.3e+e3", + "5.6714ee-2", + "4.11ee-+4", + "4.11e++4", + "1.1e.12", + "1.23e+3.", + "1.23e+3.1", + ]; for s in can_not_parse_tests { let result_128 = parse_decimal::(s, 20, 3); assert_eq!( @@ -2215,7 +2444,16 @@ mod tests { result_256.unwrap_err().to_string() ); } - let overflow_parse_tests = ["12345678", "12345678.9", "99999999.99"]; + let overflow_parse_tests = [ + "12345678", + "1.2345678e7", + "12345678.9", + "1.23456789e+7", + "99999999.99", + "9.999999999e7", + "12345678908765.123456", + "123456789087651234.56e-4", + ]; for s in overflow_parse_tests { let result_128 = parse_decimal::(s, 10, 3); let expected_128 = "Parser error: parse decimal overflow"; @@ -2262,6 +2500,16 @@ mod tests { 99999999999999999999999999999999999999i128, 38, ), + ( + "0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001016744", + 0i128, + 15, + ), + ( + "1.016744e-320", + 0i128, + 15, + ), ]; for (s, i, scale) in edge_tests_128 { let result_128 = parse_decimal::(s, 38, scale); @@ -2292,6 +2540,14 @@ mod tests { .unwrap(), 26, ), + ( + "9.999999999999999999999999999999999999999999999999999999999999999999999999999e49", + i256::from_string( + "9999999999999999999999999999999999999999999999999999999999999999999999999999", + ) + .unwrap(), + 26, + ), ( "99999999999999999999999999999999999999999999999999", i256::from_string( @@ -2300,6 +2556,14 @@ mod tests { .unwrap(), 26, ), + ( + "9.9999999999999999999999999999999999999999999999999e+49", + i256::from_string( + "9999999999999999999999999999999999999999999999999900000000000000000000000000", + ) + .unwrap(), + 26, + ), ]; for (s, i, scale) in edge_tests_256 { let result = parse_decimal::(s, 76, scale);