Skip to content

Adds lazy reader support for decimals #628

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 33 commits into from
Sep 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
e0a83d8
Top-level nulls, bools, ints
zslayton Jul 16, 2023
89f79aa
Consolidate impls of AsUtf8 w/helper fn
zslayton Jul 25, 2023
840be4d
Improved TextBufferView docs, removed DataSource
zslayton Jul 25, 2023
5db1ff0
Adds lazy text floats
zslayton Jul 27, 2023
07d4a70
Adds LazyRawTextReader support for comments
zslayton Jul 27, 2023
181e0a5
Adds LazyRawTextReader support for reading strings
zslayton Jul 28, 2023
357ca8f
clippy fixes
zslayton Jul 28, 2023
716ff34
Fix a couple of unit tests
zslayton Jul 29, 2023
e29fec5
Less ambitious float eq comparison
zslayton Jul 29, 2023
8f79a36
Adds LazyRawTextReader support for reading symbols
zslayton Aug 1, 2023
4cb9b2b
Adds more doc comments
zslayton Aug 1, 2023
54470d2
More doc comments
zslayton Aug 1, 2023
78014e7
Adds `LazyRawTextReader` support for reading lists
zslayton Aug 3, 2023
a6a3aa8
Adds `LazyRawTextReader` support for structs
zslayton Aug 10, 2023
4fc9078
More doc comments
zslayton Aug 10, 2023
11174ac
Adds `LazyRawTextReader` support for reading IVMs
zslayton Aug 10, 2023
719dbaa
Initial impl of a LazyRawAnyReader
zslayton Aug 11, 2023
f603872
Improved comments.
zslayton Aug 11, 2023
4696ca5
Adds LazyRawTextReader support for annotations
zslayton Aug 11, 2023
c7129ac
Adds lazy reader support for timestamps
zslayton Aug 14, 2023
44435ea
Lazy reader support for s-expressions
zslayton Aug 18, 2023
d50e05b
Fixed doc comments
zslayton Aug 18, 2023
8283422
Fix internal doc link
zslayton Aug 18, 2023
0f01099
Adds lazy reader support for decimals
zslayton Aug 19, 2023
b60f1fe
Fixed bad unit test example case
zslayton Aug 20, 2023
915c83a
clippy fixes
zslayton Aug 20, 2023
4b53bb3
Merge remote-tracking branch 'origin/main' into lazy-timestamps
zslayton Aug 23, 2023
60d5a17
Incorporates review feedback
zslayton Aug 23, 2023
db9718d
Matcher recognizes +00:00 as Zulu
zslayton Aug 23, 2023
37264a3
Merge remote-tracking branch 'origin/lazy-timestamps' into lazy-sexps
zslayton Aug 23, 2023
74b8baf
Merge remote-tracking branch 'origin/lazy-sexps' into lazy-decimals
zslayton Aug 23, 2023
f82bf6d
Merge remote-tracking branch 'origin/main' into lazy-decimals
zslayton Sep 1, 2023
e94e99a
Replace now-deprecated chrono method calls
zslayton Sep 1, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 113 additions & 10 deletions src/lazy/text/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use std::str::FromStr;
use nom::branch::alt;
use nom::bytes::streaming::{is_a, is_not, tag, take_until, take_while1, take_while_m_n};
use nom::character::streaming::{char, digit1, one_of, satisfy};
use nom::combinator::{fail, map, not, opt, peek, recognize, success, value};
use nom::combinator::{consumed, fail, map, not, opt, peek, recognize, success, value};
use nom::error::{ErrorKind, ParseError};
use nom::multi::{many0_count, many1_count};
use nom::sequence::{delimited, pair, preceded, separated_pair, terminated, tuple};
Expand All @@ -17,7 +17,7 @@ use crate::lazy::encoding::TextEncoding;
use crate::lazy::raw_stream_item::RawStreamItem;
use crate::lazy::text::encoded_value::EncodedTextValue;
use crate::lazy::text::matched::{
MatchedFloat, MatchedHoursAndMinutes, MatchedInt, MatchedString, MatchedSymbol,
MatchedDecimal, MatchedFloat, MatchedHoursAndMinutes, MatchedInt, MatchedString, MatchedSymbol,
MatchedTimestamp, MatchedTimestampOffset, MatchedValue,
};
use crate::lazy::text::parse_result::{InvalidInputError, IonParseError};
Expand Down Expand Up @@ -457,6 +457,16 @@ impl<'data> TextBufferView<'data> {
EncodedTextValue::new(MatchedValue::Float(matched_float), self.offset(), length)
},
),
map(
match_and_length(Self::match_decimal),
|(matched_decimal, length)| {
EncodedTextValue::new(
MatchedValue::Decimal(matched_decimal),
self.offset(),
length,
)
},
),
map(
match_and_length(Self::match_timestamp),
|(matched_timestamp, length)| {
Expand Down Expand Up @@ -867,28 +877,98 @@ impl<'data> TextBufferView<'data> {
/// Matches an `e` or `E` followed by an optional sign (`+` or `-`) followed by one or more
/// base 10 digits.
///
/// The `e`/`E` marker is matched but dropped by `preceded`; only the output of the second
/// parser is returned.
fn match_float_exponent_marker_and_digits(self) -> IonMatchResult<'data> {
preceded(one_of("eE"), Self::match_exponent_sign_and_digits)(self)
preceded(
one_of("eE"),
// `recognize` swaps the wrapped parser's output for the slice of input it consumed.
recognize(Self::match_exponent_sign_and_digits),
)(self)
}

/// Recognizes the exponent portion of a decimal (everything after the 'd') or float
/// Matches the exponent portion of a decimal (everything after the 'd') or float
/// (everything after the 'e'). This includes:
/// * an optional '+' OR '-'
/// * any number of decimal digits, which may:
/// * have underscores in between them: `1_000_000`
/// * have one or more leading zeros: `0005`
fn match_exponent_sign_and_digits(self) -> IonMatchResult<'data> {
recognize(pair(
///
/// Returns a boolean indicating whether the sign was negative (vs absent or positive)
/// and the buffer slice containing the digits.
fn match_exponent_sign_and_digits(self) -> IonParseResult<'data, (bool, Self)> {
pair(
// Optional leading sign; if there's no sign, it's not negative.
opt(Self::match_any_sign),
opt(Self::match_any_sign).map(|s| s == Some('-')),
// The exponent's digits; underscores between digits and leading zeros are permitted.
Self::match_digits_after_dot,
))(self)
)(self)
}

/// Matches `-` OR `+`.
///
/// This is used for matching exponent signs; most places in Ion do not allow `+`.
///
/// On success, the parser's output is the sign that was matched.
pub fn match_any_sign(self) -> IonMatchResult<'data> {
alt((tag("+"), tag("-")))(self)
pub fn match_any_sign(self) -> IonParseResult<'data, char> {
one_of("-+")(self)
}

/// Matches the exponent of a decimal: a `d` or `D` marker followed by an optional sign and the
/// exponent's digits.
///
/// The marker itself is discarded. On success, yields a boolean indicating whether the
/// exponent's sign was negative, along with the buffer slice containing the exponent's digits.
pub fn match_decimal_exponent(self) -> IonParseResult<'data, (bool, TextBufferView<'data>)> {
    let mut exponent_after_marker =
        preceded(one_of("dD"), Self::match_exponent_sign_and_digits);
    exponent_after_marker(self)
}

/// Matches an Ion text decimal and records where its parts are located.
///
/// A decimal is made up of:
/// * an optional leading `-`
/// * the digits before the decimal point
/// * then EITHER a `.`, optional digits after the decimal point, and an optional `d`/`D`
///   exponent, OR just a `d`/`D` exponent.
///
/// On success, yields a [`MatchedDecimal`] whose offsets are relative to the start of this
/// buffer, so it can later be read against the same buffer.
pub fn match_decimal(self) -> IonParseResult<'data, MatchedDecimal> {
    tuple((
        opt(tag("-")),
        Self::match_digits_before_dot,
        alt((
            // Either a decimal point and digits and optional d/D and exponent
            preceded(
                tag("."),
                pair(
                    alt((Self::match_digits_after_dot, Self::match_nothing)),
                    opt(Self::match_decimal_exponent),
                ),
            )
            .map(|(digits_after_dot, maybe_exponent)| {
                let (exp_is_negative, exp_digits) = match maybe_exponent {
                    Some(exponent) => exponent,
                    // No exponent: represent its digits with an empty slice.
                    None => (false, digits_after_dot.slice(digits_after_dot.len(), 0)),
                };
                (digits_after_dot, exp_is_negative, exp_digits)
            }),
            // or just a d/D and exponent
            consumed(Self::match_decimal_exponent).map(
                |(matched, (exp_is_negative, exp_digits))| {
                    // Make an empty slice to represent the (absent) digits after dot
                    let digits_after_dot = matched.slice(0, 0);
                    (digits_after_dot, exp_is_negative, exp_digits)
                },
            ),
        )),
    ))
    .map(
        |(maybe_sign, leading_digits, (digits_after_dot, exponent_is_negative, exp_digits))| {
            let is_negative = maybe_sign.is_some();
            let digits_offset = (leading_digits.offset() - self.offset()) as u16;
            // This is the length of the coefficient's *text*, so it must span any underscores.
            let digits_length = match digits_after_dot.len() {
                0 => leading_digits.len() as u16,
                trailing_text_length => {
                    // The `+1` accounts for the decimal point
                    (leading_digits.len() + 1 + trailing_text_length) as u16
                }
            };
            // Count only the digit bytes. The text after the decimal point may contain `_`
            // separators; the reader drops those from the coefficient and subtracts this
            // value from the exponent, so counting them would scale the decimal incorrectly
            // (e.g. `1.0_0` would be read with exponent -4 instead of -3).
            let trailing_digits_length = digits_after_dot
                .bytes()
                .iter()
                .filter(|b| b.is_ascii_digit())
                .count() as u16;
            let exponent_digits_offset = (exp_digits.offset() - self.offset()) as u16;
            let exponent_digits_length = exp_digits.len() as u16;
            MatchedDecimal::new(
                is_negative,
                digits_offset,
                digits_length,
                trailing_digits_length,
                exponent_is_negative,
                exponent_digits_offset,
                exponent_digits_length,
            )
        },
    )
    .parse(self)
}

/// Matches short- or long-form string.
Expand Down Expand Up @@ -1876,6 +1956,29 @@ mod tests {
}
}

// Checks which inputs `TextBufferView::match_decimal` accepts and which it rejects.
#[test]
fn test_match_decimal() {
// Expects `match_decimal` to match `input`.
fn match_decimal(input: &str) {
MatchTest::new(input).expect_match(match_length(TextBufferView::match_decimal));
}
// Expects `match_decimal` NOT to match `input`.
fn mismatch_decimal(input: &str) {
MatchTest::new(input).expect_mismatch(match_length(TextBufferView::match_decimal));
}
// Well-formed decimals: with and without fractional digits, signs, and d/D exponents.
let good_inputs = &[
"5.", "-5.", "5.0", "-5.0", "5.0d0", "-5.0d0", "5.0D0", "-5.0D0", "5.0d+1", "-5.0d-1",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add 5d0 and 5.d0 (basically, decimals that have an exponent but no fractional digits)?

];
for input in good_inputs {
match_decimal(input);
}

// Inputs the matcher is expected to reject: a bare integer, exponent markers without any
// exponent digits, and a sign that is not introduced by an exponent marker.
let bad_inputs = &[
"5", "5d", "05d", "-5d", "5.d", "-5.d", "5.D", "-5.D", "-5.0+0",
];
for input in bad_inputs {
mismatch_decimal(input);
}
}

#[test]
fn test_match_sexp() {
fn match_sexp(input: &str) {
Expand Down
1 change: 1 addition & 0 deletions src/lazy/text/encoded_value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ impl EncodedTextValue {
MatchedValue::Bool(_) => IonType::Bool,
MatchedValue::Int(_) => IonType::Int,
MatchedValue::Float(_) => IonType::Float,
MatchedValue::Decimal(_) => IonType::Decimal,
MatchedValue::Timestamp(_) => IonType::Timestamp,
MatchedValue::String(_) => IonType::String,
MatchedValue::Symbol(_) => IonType::Symbol,
Expand Down
157 changes: 155 additions & 2 deletions src/lazy/text/matched.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,23 +19,25 @@
//! use the previously recorded information to minimize the amount of information that needs to be
//! re-discovered.

use nom::character::is_hex_digit;
use nom::AsChar;
use std::borrow::Cow;
use std::num::IntErrorKind;
use std::str::FromStr;

use nom::character::is_hex_digit;
use nom::AsChar;
use num_bigint::{BigInt, BigUint};
use num_traits::Num;
use smallvec::SmallVec;

use crate::decimal::coefficient::{Coefficient, Sign};
use crate::lazy::str_ref::StrRef;
use crate::lazy::text::as_utf8::AsUtf8;
use crate::lazy::text::buffer::TextBufferView;
use crate::lazy::text::parse_result::InvalidInputError;
use crate::result::{DecodingError, IonFailure};
use crate::{
Decimal, Int, IonError, IonResult, IonType, RawSymbolTokenRef, Timestamp, TimestampPrecision,
UInt,
};

/// A partially parsed Ion value.
Expand All @@ -46,6 +48,7 @@ pub(crate) enum MatchedValue {
Bool(bool),
Int(MatchedInt),
Float(MatchedFloat),
Decimal(MatchedDecimal),
Timestamp(MatchedTimestamp),
String(MatchedString),
Symbol(MatchedSymbol),
Expand Down Expand Up @@ -167,6 +170,112 @@ impl MatchedFloat {
}
}

/// The locations of a text decimal's parts, recorded when the decimal is matched so that its
/// value can be read later without scanning for those parts again.
///
/// All offsets are relative to the start of the buffer the decimal was matched in.
#[derive(Copy, Clone, Debug, PartialEq)]
pub(crate) struct MatchedDecimal {
/// Whether the decimal began with a `-`.
is_negative: bool,
/// Offset of the first digit of the coefficient.
digits_offset: u16,
/// Length of the coefficient's text: the digits before the decimal point plus, when there are
/// digits after it, the decimal point and those digits.
digits_length: u16,
/// Size of the fractional part (what follows the decimal point). `read` subtracts this from
/// the exponent.
trailing_digits_length: u16,
/// Whether the exponent was written with a `-` sign.
exponent_is_negative: bool,
/// Offset of the first digit of the exponent.
exponent_digits_offset: u16,
/// Length of the exponent's digits. When this is zero, `read` uses an exponent of zero.
exponent_digits_length: u16,
}

impl MatchedDecimal {
    // Decimals that take more than 32 bytes of text to represent will heap allocate a larger buffer.
    const STACK_ALLOC_BUFFER_CAPACITY: usize = 32;

    /// Constructs a `MatchedDecimal` from the locations recorded by the matcher.
    ///
    /// All offsets are relative to the start of the buffer that the decimal was matched in.
    /// An `exponent_length` of zero indicates that the decimal has no explicit exponent.
    pub fn new(
        is_negative: bool,
        digits_offset: u16,
        digits_length: u16,
        trailing_digits_length: u16,
        exponent_is_negative: bool,
        exponent_offset: u16,
        exponent_length: u16,
    ) -> Self {
        Self {
            is_negative,
            digits_offset,
            digits_length,
            trailing_digits_length,
            exponent_is_negative,
            exponent_digits_offset: exponent_offset,
            exponent_digits_length: exponent_length,
        }
    }

    /// Parses the previously matched text into a [`Decimal`].
    ///
    /// `matched_input` must be the buffer in which this decimal was matched; the stored offsets
    /// are resolved against it.
    ///
    /// # Errors
    ///
    /// Returns an error if the digit text cannot be viewed as UTF-8, if the exponent's digits do
    /// not fit in an `i64`, or if adjusting the exponent for the digits after the decimal point
    /// would overflow an `i64`.
    pub fn read(&self, matched_input: TextBufferView) -> IonResult<Decimal> {
        // The longest number that can fit into a u64 without finer-grained bounds checks.
        const MAX_U64_DIGITS: usize = 19;
        // u64::MAX is a 20-digit number starting with `1`. For simplicity, we'll turn any number
        // with 19 or fewer digits into a u64 and anything else into a BigUint.

        let mut sanitized: SmallVec<[u8; Self::STACK_ALLOC_BUFFER_CAPACITY]> =
            SmallVec::with_capacity(Self::STACK_ALLOC_BUFFER_CAPACITY);

        let digits = matched_input.slice(self.digits_offset as usize, self.digits_length as usize);

        // Copy all of the digits (but not the decimal point or underscores) over to the buffer.
        sanitized.extend(
            digits
                .bytes()
                .iter()
                .copied()
                .filter(|b| b.is_ascii_digit()),
        );

        let digits_text = sanitized.as_utf8(digits.offset())?;
        // `sanitized` holds only ASCII digits here, so these conversions are expected to succeed
        // (assuming the matcher guaranteed at least one digit).
        let magnitude: UInt = if sanitized.len() <= MAX_U64_DIGITS {
            u64::from_str(digits_text).unwrap().into()
        } else {
            BigUint::from_str(digits_text).unwrap().into()
        };

        let sign = if self.is_negative {
            Sign::Negative
        } else {
            Sign::Positive
        };
        let coefficient = Coefficient::new(sign, magnitude);

        let written_exponent: i64 = match self.exponent_digits_length {
            // No exponent was written
            0 => 0,
            _ => {
                // Reuse the buffer for the exponent's digits
                sanitized.clear();
                let exponent_digits = matched_input.slice(
                    self.exponent_digits_offset as usize,
                    self.exponent_digits_length as usize,
                );
                // Copy all of the digits over to the buffer.
                sanitized.extend(
                    exponent_digits
                        .bytes()
                        .iter()
                        .copied()
                        .filter(|b| b.is_ascii_digit()),
                );
                let exponent_text = sanitized
                    .as_utf8(matched_input.offset() + self.exponent_digits_offset as usize)?;
                let exponent_magnitude = i64::from_str(exponent_text).map_err(|e| {
                    IonError::decoding_error(format!(
                        "failed to parse decimal exponent '{exponent_text}': {e:?}"
                    ))
                })?;
                if self.exponent_is_negative {
                    -exponent_magnitude
                } else {
                    exponent_magnitude
                }
            }
        };

        // Each digit after the decimal point shifts the exponent down by one. The written
        // exponent comes from the input and may be as low as `-i64::MAX`, so the subtraction is
        // checked; an out-of-range result is reported as a decoding error instead of panicking
        // (debug builds) or silently wrapping (release builds).
        let exponent = written_exponent
            .checked_sub(i64::from(self.trailing_digits_length))
            .ok_or_else(|| {
                IonError::decoding_error(format!(
                    "decimal exponent {written_exponent} is out of range once adjusted for the digits after the decimal point"
                ))
            })?;

        Ok(Decimal::new(coefficient, exponent))
    }
}

#[derive(Clone, Copy, Debug, PartialEq)]
pub(crate) enum MatchedString {
/// The string only has one segment. (e.g. "foo")
Expand Down Expand Up @@ -734,4 +843,48 @@ mod tests {

Ok(())
}

/// Matches and then reads a series of text decimals, comparing each result to the expected
/// `Decimal`.
#[test]
fn read_decimals() -> IonResult<()> {
    /// Matches a decimal at the start of `text`, reads it, and asserts that it equals `expected`.
    fn assert_reads_as(text: &str, expected: Decimal) {
        // The value is followed by a single space before being handed to the matcher.
        let padded = format!("{text} ");
        let buffer = TextBufferView::new(padded.as_bytes());
        let (_remaining, matched) = buffer.match_decimal().unwrap();
        let actual = matched.read(buffer).unwrap();
        assert_eq!(
            actual, expected,
            "Actual didn't match expected for input '{}'.\n{:?}\n!=\n{:?}",
            padded, actual, expected
        );
    }

    // (input text, expected value) pairs
    let cases = [
        ("0.", Decimal::new(0, 0)),
        ("-0.", Decimal::negative_zero()),
        ("5.", Decimal::new(5, 0)),
        ("-5.", Decimal::new(-5, 0)),
        ("5.d0", Decimal::new(5, 0)),
        ("-5.d0", Decimal::new(-5, 0)),
        ("5.0", Decimal::new(50, -1)),
        ("-5.0", Decimal::new(-50, -1)),
        ("5.0d", Decimal::new(50, -1)),
        ("-5.0d", Decimal::new(-50, -1)),
        ("500d0", Decimal::new(5, 2)),
        ("-500d0", Decimal::new(-5, 2)),
        ("0.005", Decimal::new(5, -3)),
        ("-0.005", Decimal::new(-5, -3)),
        ("0.005D2", Decimal::new(5, -1)),
        ("-0.005D2", Decimal::new(-5, -1)),
        ("0.005d+2", Decimal::new(5, -1)),
        ("-0.005d+2", Decimal::new(-5, -1)),
        ("0.005D-2", Decimal::new(5, -5)),
        ("-0.005D-2", Decimal::new(-5, -5)),
    ];

    for (text, expected) in cases {
        assert_reads_as(text, expected);
    }

    Ok(())
}
}
Loading