-
Notifications
You must be signed in to change notification settings - Fork 35
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adds support for floats to the LazyRawTextReader
#612
Changes from all commits
e0a83d8
89f79aa
840be4d
5db1ff0
8a39674
2bce25f
2c613d8
e01e20c
7b008b6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
use crate::lazy::encoding::TextEncoding; | ||
use crate::lazy::raw_stream_item::RawStreamItem; | ||
use crate::lazy::text::encoded_value::EncodedTextValue; | ||
use crate::lazy::text::matched::{MatchedInt, MatchedValue}; | ||
use crate::lazy::text::matched::{MatchedFloat, MatchedInt, MatchedValue}; | ||
use crate::lazy::text::parse_result::IonParseError; | ||
use crate::lazy::text::parse_result::{IonMatchResult, IonParseResult}; | ||
use crate::lazy::text::value::LazyRawTextValue; | ||
|
@@ -12,7 +12,7 @@ use nom::character::streaming::{char, digit1, one_of}; | |
use nom::combinator::{map, opt, peek, recognize, success, value}; | ||
use nom::error::{ErrorKind, ParseError}; | ||
use nom::multi::many0_count; | ||
use nom::sequence::{delimited, pair, preceded, separated_pair, terminated}; | ||
use nom::sequence::{delimited, pair, preceded, separated_pair, terminated, tuple}; | ||
use nom::{CompareResult, IResult, InputLength, InputTake, Needed, Parser}; | ||
use std::fmt::{Debug, Formatter}; | ||
use std::iter::{Copied, Enumerate}; | ||
|
@@ -192,6 +192,12 @@ impl<'data> TextBufferView<'data> { | |
EncodedTextValue::new(MatchedValue::Int(matched_int), self.offset(), length) | ||
}, | ||
), | ||
map( | ||
match_and_length(Self::match_float), | ||
|(matched_float, length)| { | ||
EncodedTextValue::new(MatchedValue::Float(matched_float), self.offset(), length) | ||
}, | ||
), | ||
// TODO: The other Ion types | ||
)) | ||
.map(|encoded_value| LazyRawTextValue { | ||
|
@@ -372,6 +378,111 @@ impl<'data> TextBufferView<'data> { | |
fn take_base_16_digits1(self) -> IonMatchResult<'data> { | ||
take_while1(|b: u8| b.is_ascii_hexdigit())(self) | ||
} | ||
|
||
/// Matches an Ion float of any syntax | ||
fn match_float(self) -> IonParseResult<'data, MatchedFloat> { | ||
alt(( | ||
Self::match_float_special_value, | ||
Self::match_float_numeric_value, | ||
))(self) | ||
} | ||
Comment on lines
+383
to
+388
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🗺️ This is the entry point for parsing |
||
|
||
/// Matches special IEEE-754 values, including +/- infinity and NaN. | ||
fn match_float_special_value(self) -> IonParseResult<'data, MatchedFloat> { | ||
alt(( | ||
value(MatchedFloat::NotANumber, tag("nan")), | ||
value(MatchedFloat::PositiveInfinity, tag("+inf")), | ||
value(MatchedFloat::NegativeInfinity, tag("-inf")), | ||
))(self) | ||
} | ||
|
||
/// Matches numeric IEEE-754 floating point values. | ||
fn match_float_numeric_value(self) -> IonParseResult<'data, MatchedFloat> { | ||
terminated( | ||
recognize(pair( | ||
Self::match_number_with_optional_dot_and_digits, | ||
Self::match_float_exponent_marker_and_digits, | ||
)), | ||
Self::peek_stop_character, | ||
) | ||
.map(|_matched| MatchedFloat::Numeric) | ||
.parse(self) | ||
} | ||
|
||
/// Matches a number that may or may not have a decimal place and trailing fractional digits. | ||
/// If a decimal place is present, there must also be trailing digits. | ||
/// For example: | ||
/// 1000 | ||
/// 1000.559 | ||
/// -25.2 | ||
fn match_number_with_optional_dot_and_digits(self) -> IonMatchResult<'data> { | ||
recognize(tuple(( | ||
opt(tag("-")), | ||
Self::match_base_10_digits_before_dot, | ||
opt(Self::match_dot_followed_by_base_10_digits), | ||
)))(self) | ||
} | ||
|
||
/// In a float or decimal, matches the digits that are permitted before the decimal point. | ||
/// This includes either a single zero, or a non-zero followed by any sequence of digits. | ||
fn match_digits_before_dot(self) -> IonMatchResult<'data> { | ||
alt(( | ||
tag("0"), | ||
recognize(pair(Self::match_leading_digit, Self::match_trailing_digits)), | ||
))(self) | ||
} | ||
|
||
/// Matches a single non-zero base 10 digit. | ||
fn match_leading_digit(self) -> IonMatchResult<'data> { | ||
recognize(one_of("123456789"))(self) | ||
} | ||
|
||
/// Matches any number of base 10 digits, allowing underscores at any position except the end. | ||
fn match_trailing_digits(self) -> IonMatchResult<'data> { | ||
recognize(many0_count(preceded(opt(char('_')), digit1)))(self) | ||
} | ||
|
||
/// Recognizes a decimal point followed by any number of base-10 digits. | ||
fn match_dot_followed_by_base_10_digits(self) -> IonMatchResult<'data> { | ||
recognize(preceded(tag("."), opt(Self::match_digits_after_dot)))(self) | ||
} | ||
|
||
/// Like `match_digits_before_dot`, but allows leading zeros. | ||
fn match_digits_after_dot(self) -> IonMatchResult<'data> { | ||
recognize(terminated( | ||
// Zero or more digits-followed-by-underscores | ||
many0_count(pair(digit1, char('_'))), | ||
// One or more digits | ||
digit1, | ||
))(self) | ||
} | ||
|
||
/// Matches an `e` or `E` followed by an optional sign (`+` or `-`) followed by one or more | ||
/// base 10 digits. | ||
fn match_float_exponent_marker_and_digits(self) -> IonMatchResult<'data> { | ||
preceded(one_of("eE"), Self::match_exponent_sign_and_digits)(self) | ||
} | ||
|
||
/// Recognizes the exponent portion of a decimal (everything after the 'd') or float | ||
/// (everything after the 'e'). This includes: | ||
/// * an optional '+' OR '-' | ||
/// * any number of decimal digits, which may: | ||
/// * have underscores in between them: `1_000_000` | ||
/// * have one or more leading zeros: `0005` | ||
fn match_exponent_sign_and_digits(self) -> IonMatchResult<'data> { | ||
recognize(pair( | ||
// Optional leading sign; if there's no sign, it's not negative. | ||
opt(Self::match_any_sign), | ||
Self::match_digits_after_dot, | ||
))(self) | ||
} | ||
|
||
/// Matches `-` OR `+`. | ||
/// | ||
/// This is used for matching exponent signs; most places in Ion do not allow `+`. | ||
pub fn match_any_sign(self) -> IonMatchResult<'data> { | ||
alt((tag("+"), tag("-")))(self) | ||
} | ||
} | ||
|
||
// === nom trait implementations === | ||
|
@@ -602,7 +713,12 @@ mod tests { | |
{ | ||
let result = self.try_match(parser); | ||
// We expect this to fail for one reason or another | ||
result.unwrap_err(); | ||
assert!( | ||
result.is_err(), | ||
"Expected a parse failure for input: {:?}\nResult: {:?}", | ||
self.input, | ||
result | ||
); | ||
} | ||
} | ||
|
||
|
@@ -729,4 +845,38 @@ mod tests { | |
mismatch_int(input); | ||
} | ||
} | ||
|
||
#[test]
fn test_match_float() {
    // Asserts that `text` is accepted by the float matcher.
    fn expect_float(text: &str) {
        MatchTest::new(text).expect_match(match_length(TextBufferView::match_float));
    }
    // Asserts that `text` is rejected by the float matcher.
    fn expect_not_float(text: &str) {
        MatchTest::new(text).expect_mismatch(match_length(TextBufferView::match_float));
    }

    const GOOD_INPUTS: &[&str] = &[
        "0.0e0", "0E0", "0e0", "305e1", "305e+1", "305e-1", "305e100", "305e-100", "305e+100",
        "305.0e1", "0.279e3", "279e0", "279.5e0", "279.5E0",
    ];
    // Every valid float must also be valid once negated.
    for &input in GOOD_INPUTS {
        expect_float(input);
        expect_float(&format!("-{input}"));
    }

    const BAD_INPUTS: &[&str] = &[
        "305",      // Integer
        "305e",     // Has exponent delimiter but no exponent
        ".305e",    // No digits before the decimal point
        "305e0.5",  // Fractional exponent
        "305e-0.5", // Negative fractional exponent
        "0305e1",   // Leading zero
        "+305e1",   // Leading plus sign
        "--305e1",  // Multiple negative signs
    ];
    for &input in BAD_INPUTS {
        expect_not_float(input);
    }
}
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,14 +19,17 @@ | |
//! use the previously recorded information to minimize the amount of information that needs to be | ||
//! re-discovered. | ||
|
||
use crate::lazy::text::as_utf8::AsUtf8; | ||
use crate::lazy::text::buffer::TextBufferView; | ||
use crate::result::IonFailure; | ||
use crate::{Int, IonResult, IonType}; | ||
use std::num::IntErrorKind; | ||
|
||
use num_bigint::BigInt; | ||
use num_traits::Num; | ||
use smallvec::SmallVec; | ||
use std::num::IntErrorKind; | ||
|
||
use crate::lazy::text::as_utf8::AsUtf8; | ||
use crate::lazy::text::buffer::TextBufferView; | ||
use crate::lazy::text::parse_result::InvalidInputError; | ||
use crate::result::IonFailure; | ||
use crate::{Int, IonError, IonResult, IonType}; | ||
|
||
/// A partially parsed Ion value. | ||
#[derive(Copy, Clone, Debug, PartialEq)] | ||
|
@@ -35,6 +38,7 @@ pub(crate) enum MatchedValue { | |
Null(IonType), | ||
Bool(bool), | ||
Int(MatchedInt), | ||
Float(MatchedFloat), | ||
// TODO: ...the other types | ||
} | ||
|
||
|
@@ -107,3 +111,46 @@ impl MatchedInt { | |
Ok(int) | ||
} | ||
} | ||
|
||
/// A partially parsed Ion float.
///
/// Records which syntax was matched; the numeric text itself is only converted to an `f64`
/// when `read` is called.
#[derive(Copy, Clone, Debug, PartialEq)]
pub(crate) enum MatchedFloat {
    /// The keyword `+inf`
    PositiveInfinity,
    /// The keyword `-inf`
    NegativeInfinity,
    /// The keyword `nan`
    NotANumber,
    /// Any float written with digits and an exponent
    Numeric,
}
Comment on lines
+115
to
+126
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🗺️ PR #609 introduced an |
||
|
||
impl MatchedFloat { | ||
// Floats that take more than 32 bytes of text to represent will heap allocate a larger buffer. | ||
const STACK_ALLOC_BUFFER_CAPACITY: usize = 32; | ||
|
||
pub fn read(&self, matched_input: TextBufferView) -> IonResult<f64> { | ||
use std::str::FromStr; | ||
|
||
match self { | ||
MatchedFloat::PositiveInfinity => return Ok(f64::INFINITY), | ||
MatchedFloat::NegativeInfinity => return Ok(f64::NEG_INFINITY), | ||
MatchedFloat::NotANumber => return Ok(f64::NAN), | ||
MatchedFloat::Numeric => {} // fall through | ||
}; | ||
|
||
let mut sanitized: SmallVec<[u8; Self::STACK_ALLOC_BUFFER_CAPACITY]> = | ||
SmallVec::with_capacity(Self::STACK_ALLOC_BUFFER_CAPACITY); | ||
sanitized.extend(matched_input.bytes().iter().copied().filter(|b| *b != b'_')); | ||
|
||
let text = sanitized.as_utf8(matched_input.offset())?; | ||
let float = f64::from_str(text).map_err(|e| { | ||
let error: IonError = InvalidInputError::new(matched_input) | ||
.with_description(format!("encountered an unexpected error ({:?})", e)) | ||
.with_label("parsing a float") | ||
.into(); | ||
error | ||
})?; | ||
Ok(float) | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🗺️ When we match a `float`, record information about the match (offset, length, what kind of float) in case we eventually parse it.