Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds support for floats to the LazyRawTextReader #612

Merged
merged 9 commits into from
Aug 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 153 additions & 3 deletions src/lazy/text/buffer.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use crate::lazy::encoding::TextEncoding;
use crate::lazy::raw_stream_item::RawStreamItem;
use crate::lazy::text::encoded_value::EncodedTextValue;
use crate::lazy::text::matched::{MatchedInt, MatchedValue};
use crate::lazy::text::matched::{MatchedFloat, MatchedInt, MatchedValue};
use crate::lazy::text::parse_result::IonParseError;
use crate::lazy::text::parse_result::{IonMatchResult, IonParseResult};
use crate::lazy::text::value::LazyRawTextValue;
Expand All @@ -12,7 +12,7 @@ use nom::character::streaming::{char, digit1, one_of};
use nom::combinator::{map, opt, peek, recognize, success, value};
use nom::error::{ErrorKind, ParseError};
use nom::multi::many0_count;
use nom::sequence::{delimited, pair, preceded, separated_pair, terminated};
use nom::sequence::{delimited, pair, preceded, separated_pair, terminated, tuple};
use nom::{CompareResult, IResult, InputLength, InputTake, Needed, Parser};
use std::fmt::{Debug, Formatter};
use std::iter::{Copied, Enumerate};
Expand Down Expand Up @@ -192,6 +192,12 @@ impl<'data> TextBufferView<'data> {
EncodedTextValue::new(MatchedValue::Int(matched_int), self.offset(), length)
},
),
map(
match_and_length(Self::match_float),
|(matched_float, length)| {
EncodedTextValue::new(MatchedValue::Float(matched_float), self.offset(), length)
},
Comment on lines +197 to +199
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🗺️ When we match a float, record information about the match (offset, length, what kind of float) in case we eventually parse it.

),
// TODO: The other Ion types
))
.map(|encoded_value| LazyRawTextValue {
Expand Down Expand Up @@ -372,6 +378,111 @@ impl<'data> TextBufferView<'data> {
/// Matches a run of one or more ASCII hexadecimal digits (`0-9`, `a-f`, `A-F`) at the head of
/// the buffer.
fn take_base_16_digits1(self) -> IonMatchResult<'data> {
    let is_hex_digit = |byte: u8| byte.is_ascii_hexdigit();
    take_while1(is_hex_digit)(self)
}

/// Matches an Ion float written in either of its syntaxes: a keyword special value
/// (`nan`, `+inf`, `-inf`) or a numeric value. The special values are tried first.
fn match_float(self) -> IonParseResult<'data, MatchedFloat> {
    let mut any_float_syntax = alt((
        Self::match_float_special_value,
        Self::match_float_numeric_value,
    ));
    any_float_syntax(self)
}
Comment on lines +383 to +388
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🗺️ This is the entry point for parsing floats, which have two kinds of syntax: keyword special values (+inf, -inf, nan), and numeric values. Numeric values come in a big variety of shapes.


/// Matches one of the keyword float values and reports which one was found:
/// `nan`, `+inf`, or `-inf` (tried in that order).
fn match_float_special_value(self) -> IonParseResult<'data, MatchedFloat> {
    let not_a_number = value(MatchedFloat::NotANumber, tag("nan"));
    let positive_infinity = value(MatchedFloat::PositiveInfinity, tag("+inf"));
    let negative_infinity = value(MatchedFloat::NegativeInfinity, tag("-inf"));
    alt((not_a_number, positive_infinity, negative_infinity))(self)
}

/// Matches numeric IEEE-754 floating point values.
fn match_float_numeric_value(self) -> IonParseResult<'data, MatchedFloat> {
terminated(
recognize(pair(
Self::match_number_with_optional_dot_and_digits,
Self::match_float_exponent_marker_and_digits,
)),
Self::peek_stop_character,
)
.map(|_matched| MatchedFloat::Numeric)
.parse(self)
}

/// Matches an optionally negative number that may or may not have a decimal point.
/// When a decimal point is present, the fractional digits after it are themselves optional
/// (see `match_dot_followed_by_base_10_digits`).
/// For example:
///   1000
///   1000.559
///   -25.2
fn match_number_with_optional_dot_and_digits(self) -> IonMatchResult<'data> {
    let optional_minus = opt(tag("-"));
    let optional_fraction = opt(Self::match_dot_followed_by_base_10_digits);
    let number = tuple((
        optional_minus,
        Self::match_base_10_digits_before_dot,
        optional_fraction,
    ));
    recognize(number)(self)
}

/// Matches the integer digits of a float or decimal, i.e. the digits that come before the
/// decimal point. That is either a lone `0`, or a non-zero leading digit followed by
/// whatever `match_trailing_digits` accepts.
fn match_digits_before_dot(self) -> IonMatchResult<'data> {
    let zero = tag("0");
    let non_zero_number = recognize(pair(
        Self::match_leading_digit,
        Self::match_trailing_digits,
    ));
    alt((zero, non_zero_number))(self)
}

/// Matches exactly one base 10 digit in the range `1` through `9`.
fn match_leading_digit(self) -> IonMatchResult<'data> {
    let non_zero_digit = one_of("123456789");
    recognize(non_zero_digit)(self)
}

/// Matches zero or more groups of base 10 digits, where each group may be preceded by a single
/// underscore. Every underscore must be followed by at least one digit, so the matched text
/// never ends with an underscore.
fn match_trailing_digits(self) -> IonMatchResult<'data> {
    let underscore_then_digits = preceded(opt(char('_')), digit1);
    recognize(many0_count(underscore_then_digits))(self)
}

/// Recognizes a decimal point (`.`) and any base 10 digits that follow it. The digits are
/// optional: a lone `.` is also recognized.
fn match_dot_followed_by_base_10_digits(self) -> IonMatchResult<'data> {
    let optional_fraction_digits = opt(Self::match_digits_after_dot);
    recognize(preceded(tag("."), optional_fraction_digits))(self)
}

/// Matches one or more base 10 digits in which groups of digits may be separated by single
/// underscores. Unlike `match_digits_before_dot`, leading zeros are permitted.
fn match_digits_after_dot(self) -> IonMatchResult<'data> {
    // Zero or more groups of digits that each end with an underscore...
    let underscore_terminated_groups = many0_count(pair(digit1, char('_')));
    // ...followed by a mandatory final group of one or more digits.
    let digits = terminated(underscore_terminated_groups, digit1);
    recognize(digits)(self)
}

/// Matches a float's exponent: an `e` or `E` marker, then an optional sign (`+` or `-`), then
/// one or more base 10 digits. The marker itself is not part of the returned match.
fn match_float_exponent_marker_and_digits(self) -> IonMatchResult<'data> {
    let exponent_marker = one_of("eE");
    let mut exponent = preceded(exponent_marker, Self::match_exponent_sign_and_digits);
    exponent(self)
}

/// Recognizes the text that follows an exponent marker: everything after the 'd' in a
/// decimal or after the 'e' in a float. That text consists of:
///   * an optional sign, either '+' or '-'
///   * one or more decimal digits, which may:
///     * be separated by underscores: `1_000_000`
///     * begin with one or more zeros: `0005`
fn match_exponent_sign_and_digits(self) -> IonMatchResult<'data> {
    // A missing sign is allowed; the exponent is then simply not negative.
    let optional_sign = opt(Self::match_any_sign);
    let signed_digits = pair(optional_sign, Self::match_digits_after_dot);
    recognize(signed_digits)(self)
}

/// Matches a single sign character, either `+` or `-`.
///
/// Intended for exponent signs; most other places in Ion do not permit `+`.
pub fn match_any_sign(self) -> IonMatchResult<'data> {
    let plus = tag("+");
    let minus = tag("-");
    alt((plus, minus))(self)
}
}

// === nom trait implementations ===
Expand Down Expand Up @@ -602,7 +713,12 @@ mod tests {
{
let result = self.try_match(parser);
// We expect this to fail for one reason or another
result.unwrap_err();
assert!(
result.is_err(),
"Expected a parse failure for input: {:?}\nResult: {:?}",
self.input,
result
);
}
}

Expand Down Expand Up @@ -729,4 +845,38 @@ mod tests {
mismatch_int(input);
}
}

#[test]
fn test_match_float() {
    // Asserts that `match_float` accepts `text`.
    fn expect_float(text: &str) {
        MatchTest::new(text).expect_match(match_length(TextBufferView::match_float));
    }
    // Asserts that `match_float` rejects `text`.
    fn expect_not_float(text: &str) {
        MatchTest::new(text).expect_mismatch(match_length(TextBufferView::match_float));
    }

    let valid_floats: &[&str] = &[
        "0.0e0", "0E0", "0e0", "305e1", "305e+1", "305e-1", "305e100", "305e-100", "305e+100",
        "305.0e1", "0.279e3", "279e0", "279.5e0", "279.5E0",
    ];
    for &text in valid_floats {
        expect_float(text);
        // Every valid float must remain valid when it is negated.
        let negated = format!("-{text}");
        expect_float(&negated);
    }

    let invalid_floats: &[&str] = &[
        "305",      // Integer
        "305e",     // Has exponent delimiter but no exponent
        ".305e",    // No digits before the decimal point
        "305e0.5",  // Fractional exponent
        "305e-0.5", // Negative fractional exponent
        "0305e1",   // Leading zero
        "+305e1",   // Leading plus sign
        "--305e1",  // Multiple negative signs
    ];
    for &text in invalid_floats {
        expect_not_float(text);
    }
}
}
1 change: 1 addition & 0 deletions src/lazy/text/encoded_value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ impl EncodedTextValue {
MatchedValue::Null(ion_type) => ion_type,
MatchedValue::Bool(_) => IonType::Bool,
MatchedValue::Int(_) => IonType::Int,
MatchedValue::Float(_) => IonType::Float,
}
}

Expand Down
57 changes: 52 additions & 5 deletions src/lazy/text/matched.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,17 @@
//! use the previously recorded information to minimize the amount of information that needs to be
//! re-discovered.

use crate::lazy::text::as_utf8::AsUtf8;
use crate::lazy::text::buffer::TextBufferView;
use crate::result::IonFailure;
use crate::{Int, IonResult, IonType};
use std::num::IntErrorKind;

use num_bigint::BigInt;
use num_traits::Num;
use smallvec::SmallVec;
use std::num::IntErrorKind;

use crate::lazy::text::as_utf8::AsUtf8;
use crate::lazy::text::buffer::TextBufferView;
use crate::lazy::text::parse_result::InvalidInputError;
use crate::result::IonFailure;
use crate::{Int, IonError, IonResult, IonType};

/// A partially parsed Ion value.
#[derive(Copy, Clone, Debug, PartialEq)]
Expand All @@ -35,6 +38,7 @@ pub(crate) enum MatchedValue {
Null(IonType),
Bool(bool),
Int(MatchedInt),
Float(MatchedFloat),
// TODO: ...the other types
}

Expand Down Expand Up @@ -107,3 +111,46 @@ impl MatchedInt {
Ok(int)
}
}

/// A partially parsed Ion float.
///
/// Records only which kind of float syntax was matched. The variant carries no text of its
/// own; `MatchedFloat::read` is handed the matched input when the value is actually needed.
#[derive(Copy, Clone, Debug, PartialEq)]
pub(crate) enum MatchedFloat {
/// The keyword `+inf`; read as `f64::INFINITY`.
PositiveInfinity,
/// The keyword `-inf`; read as `f64::NEG_INFINITY`.
NegativeInfinity,
/// The keyword `nan`; read as `f64::NAN`.
NotANumber,
/// Any numeric float value; its text is parsed into an `f64` when it is read.
Numeric,
}
Comment on lines +115 to +126
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🗺️ PR #609 introduced an EncodedValue struct that stores information like offset and length, but which also has a MatchedValue enum field. This is one of the variants of MatchedValue, and remembers the kind of float we encountered in case we read it later. The offset and length will be available in the parent data structure.


impl MatchedFloat {
// Floats that take more than 32 bytes of text to represent will heap allocate a larger buffer.
const STACK_ALLOC_BUFFER_CAPACITY: usize = 32;

pub fn read(&self, matched_input: TextBufferView) -> IonResult<f64> {
use std::str::FromStr;

match self {
MatchedFloat::PositiveInfinity => return Ok(f64::INFINITY),
MatchedFloat::NegativeInfinity => return Ok(f64::NEG_INFINITY),
MatchedFloat::NotANumber => return Ok(f64::NAN),
MatchedFloat::Numeric => {} // fall through
};

let mut sanitized: SmallVec<[u8; Self::STACK_ALLOC_BUFFER_CAPACITY]> =
SmallVec::with_capacity(Self::STACK_ALLOC_BUFFER_CAPACITY);
sanitized.extend(matched_input.bytes().iter().copied().filter(|b| *b != b'_'));

let text = sanitized.as_utf8(matched_input.offset())?;
let float = f64::from_str(text).map_err(|e| {
let error: IonError = InvalidInputError::new(matched_input)
.with_description(format!("encountered an unexpected error ({:?})", e))
.with_label("parsing a float")
.into();
error
})?;
Ok(float)
}
}
68 changes: 45 additions & 23 deletions src/lazy/text/parse_result.rs
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,25 @@ impl<'data> From<InvalidInputError<'data>> for IonParseError<'data> {
}
}

// We cannot provide an analogous impl for `Incomplete` because it is missing necessary data.
impl<'data> From<InvalidInputError<'data>> for IonError {
    /// Builds an `IonError::Decoding` from an invalid-input error.
    ///
    /// The message is the error's description (or a generic fallback when it has none),
    /// followed by " while <label>" when a label is present. The reported position uses the
    /// offset and length of the offending input.
    fn from(invalid_input_error: InvalidInputError) -> Self {
        let description = invalid_input_error
            .description()
            .unwrap_or("invalid Ion syntax encountered");
        let mut message = String::from(description);
        if let Some(label) = invalid_input_error.label {
            message.push_str(" while ");
            message.push_str(label.as_ref());
        }

        let offset = invalid_input_error.input.offset();
        let length = invalid_input_error.input.len();
        let position = Position::with_offset(offset).with_length(length);
        IonError::Decoding(DecodingError::new(message).with_position(position))
    }
}

impl<'data> From<nom::Err<IonParseError<'data>>> for IonParseError<'data> {
fn from(value: Err<IonParseError<'data>>) -> Self {
match value {
Expand Down Expand Up @@ -200,6 +219,31 @@ pub(crate) trait AddContext<'data, T> {
) -> IonResult<(TextBufferView<'data>, T)>;
}

// Turns a nom error wrapping an IonParseError into an IonResult
impl<'data, T> AddContext<'data, T> for nom::Err<IonParseError<'data>> {
    /// Converts the nom error into an `IonParseError` and delegates to that type's
    /// `with_context` implementation.
    fn with_context(
        self,
        label: impl Into<Cow<'static, str>>,
        input: TextBufferView<'data>,
    ) -> IonResult<(TextBufferView<'data>, T)> {
        IonParseError::from(self).with_context(label, input)
    }
}

// Turns an IonParseError into an IonResult
impl<'data, T> AddContext<'data, T> for IonParseError<'data> {
    /// Maps `Incomplete` to an incomplete-data result labelled with `label` at the offset of
    /// `input`, and maps `Invalid` to the `IonError` produced from the invalid-input error.
    fn with_context(
        self,
        label: impl Into<Cow<'static, str>>,
        input: TextBufferView<'data>,
    ) -> IonResult<(TextBufferView<'data>, T)> {
        let invalid_input_error = match self {
            IonParseError::Incomplete => return IonResult::incomplete(label, input.offset()),
            IonParseError::Invalid(invalid_input_error) => invalid_input_error,
        };
        Err(IonError::from(invalid_input_error))
    }
}

impl<'data, T> AddContext<'data, T> for IonParseResult<'data, T> {
fn with_context(
self,
Expand All @@ -209,29 +253,7 @@ impl<'data, T> AddContext<'data, T> for IonParseResult<'data, T> {
match self {
// No change needed in the ok case
Ok(matched) => Ok(matched),
// If the error was an incomplete
Err(e) => {
// Nom error to IonParseError
match IonParseError::from(e) {
IonParseError::Incomplete => IonResult::incomplete(label, input.offset()),
IonParseError::Invalid(invalid_input_error) => {
dbg!(&invalid_input_error.backtrace);
let mut message = String::from(
invalid_input_error
.description()
.unwrap_or("invalid text Ion syntax"),
);
if let Some(label) = invalid_input_error.label {
message.push_str(" while ");
message.push_str(label.as_ref());
}
let position = Position::with_offset(invalid_input_error.input.offset())
.with_length(invalid_input_error.input.len());
let decoding_error = DecodingError::new(message).with_position(position);
Err(IonError::Decoding(decoding_error))
}
}
}
Err(e) => e.with_context(label, input),
}
}
}
Expand Down
Loading
Loading