Skip to content

Adds lazy reader support for reading annotations #622

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Aug 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 61 additions & 9 deletions src/lazy/any_encoding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

use std::fmt::Debug;

use crate::lazy::binary::raw::annotations_iterator::RawBinaryAnnotationsIterator;
use crate::lazy::binary::raw::r#struct::{
LazyRawBinaryField, LazyRawBinaryStruct, RawBinaryStructIterator,
};
Expand All @@ -22,7 +23,7 @@ use crate::lazy::text::raw::r#struct::{
};
use crate::lazy::text::raw::reader::LazyRawTextReader;
use crate::lazy::text::raw::sequence::{LazyRawTextSequence, RawTextSequenceIterator};
use crate::lazy::text::value::LazyRawTextValue;
use crate::lazy::text::value::{LazyRawTextValue, RawTextAnnotationsIterator};
use crate::{IonResult, IonType, RawSymbolTokenRef};

/// An implementation of the `LazyDecoder` trait that can read either text or binary Ion.
Expand All @@ -37,7 +38,7 @@ impl<'data> LazyDecoder<'data> for AnyEncoding {
type Value = LazyRawAnyValue<'data>;
type Sequence = LazyRawAnySequence<'data>;
type Struct = LazyRawAnyStruct<'data>;
type AnnotationsIterator = Box<dyn Iterator<Item = IonResult<RawSymbolTokenRef<'data>>>>;
type AnnotationsIterator = RawAnyAnnotationsIterator<'data>;
Comment on lines -40 to +41
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🗺️ The Box<dyn _> type was just a placeholder.

}

// ===== Readers ======
Expand Down Expand Up @@ -181,10 +182,10 @@ impl<'data> From<RawStreamItem<'data, BinaryEncoding>> for RawStreamItem<'data,
}

impl<'data> LazyRawValuePrivate<'data> for LazyRawAnyValue<'data> {
fn field_name(&self) -> Option<RawSymbolTokenRef<'data>> {
fn field_name(&self) -> IonResult<RawSymbolTokenRef<'data>> {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🗺️ This trait method signature had to be changed. The binary raw reader only ever encounters SymbolId field names. The text reader, however, can encounter string and symbol field names with invalid text. (For example: illegal escape sequences or invalid unicode.) Thus, we now return an IonResult instead of an Option.

match &self.encoding {
LazyRawValueKind::Text_1_0(v) => v.field_name(),
LazyRawValueKind::Binary_1_0(v) => v.field_name().map(RawSymbolTokenRef::SymbolId),
LazyRawValueKind::Binary_1_0(v) => v.field_name(),
}
}
}
Expand All @@ -204,8 +205,15 @@ impl<'data> LazyRawValue<'data, AnyEncoding> for LazyRawAnyValue<'data> {
}
}

fn annotations(&self) -> <AnyEncoding as LazyDecoder<'data>>::AnnotationsIterator {
todo!()
Comment on lines -207 to -208
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🗺️ As with the other LazyRawAnyReader methods, we branch on which encoding we're reading and delegate the method call to the appropriate reader.

/// Returns an iterator over this value's annotations, delegating to the text or binary
/// value depending on which encoding is being read.
fn annotations(&self) -> RawAnyAnnotationsIterator<'data> {
    // Pick the encoding-specific iterator first, then wrap it exactly once.
    let encoding = match &self.encoding {
        LazyRawValueKind::Text_1_0(text_value) => {
            RawAnnotationsIteratorKind::Text_1_0(text_value.annotations())
        }
        LazyRawValueKind::Binary_1_0(binary_value) => {
            RawAnnotationsIteratorKind::Binary_1_0(binary_value.annotations())
        }
    };
    RawAnyAnnotationsIterator { encoding }
}

fn read(&self) -> IonResult<RawValueRef<'data, AnyEncoding>> {
Expand All @@ -216,6 +224,28 @@ impl<'data> LazyRawValue<'data, AnyEncoding> for LazyRawAnyValue<'data> {
}
}

// ===== Annotations =====

/// An annotations iterator that wraps either a text or a binary annotations iterator.
pub struct RawAnyAnnotationsIterator<'data> {
// The encoding-specific iterator being wrapped.
encoding: RawAnnotationsIteratorKind<'data>,
}

/// The encoding-specific annotations iterator held by a `RawAnyAnnotationsIterator`.
pub enum RawAnnotationsIteratorKind<'data> {
// Holds the text reader's annotations iterator.
Text_1_0(RawTextAnnotationsIterator<'data>),
// Holds the binary reader's annotations iterator.
Binary_1_0(RawBinaryAnnotationsIterator<'data>),
}

impl<'data> Iterator for RawAnyAnnotationsIterator<'data> {
    type Item = IonResult<RawSymbolTokenRef<'data>>;

    /// Yields the next annotation from whichever encoding-specific iterator is wrapped.
    fn next(&mut self) -> Option<Self::Item> {
        let Self { encoding } = self;
        match encoding {
            RawAnnotationsIteratorKind::Text_1_0(text) => text.next(),
            RawAnnotationsIteratorKind::Binary_1_0(binary) => binary.next(),
        }
    }
}

// ===== Sequences ======

#[derive(Debug, Clone)]
Expand Down Expand Up @@ -422,7 +452,14 @@ impl<'data> LazyRawStruct<'data, AnyEncoding> for LazyRawAnyStruct<'data> {
type Iterator = RawAnyStructIterator<'data>;

fn annotations(&self) -> <AnyEncoding as LazyDecoder<'data>>::AnnotationsIterator {
todo!()
match &self.encoding {
LazyRawStructKind::Text_1_0(s) => RawAnyAnnotationsIterator {
encoding: RawAnnotationsIteratorKind::Text_1_0(s.annotations()),
},
LazyRawStructKind::Binary_1_0(s) => RawAnyAnnotationsIterator {
encoding: RawAnnotationsIteratorKind::Binary_1_0(s.annotations()),
},
}
}

fn find(&self, name: &str) -> IonResult<Option<LazyRawAnyValue<'data>>> {
Expand Down Expand Up @@ -491,13 +528,20 @@ mod tests {
use crate::lazy::decoder::{LazyRawReader, LazyRawSequence, LazyRawValue};
use crate::lazy::raw_stream_item::RawStreamItem;
use crate::lazy::raw_value_ref::RawValueRef;
use crate::IonResult;
use crate::{IonResult, RawSymbolTokenRef};

#[test]
fn any_encoding() -> IonResult<()> {
fn test_input(data: &[u8]) -> IonResult<()> {
let mut reader = LazyRawAnyReader::new(data);
assert_eq!(reader.next()?.expect_ivm()?, (1, 0));
let _strukt = reader.next()?.expect_value()?.read()?.expect_struct()?;
let name = reader.next()?.expect_value()?;
assert_eq!(
name.annotations().next().unwrap()?,
RawSymbolTokenRef::SymbolId(4)
);
assert_eq!(name.read()?.expect_string()?.text(), "Gary");
assert_eq!(
reader.next()?.expect_value()?.read()?,
RawValueRef::String("foo".into())
Expand All @@ -524,7 +568,15 @@ mod tests {
Ok(())
}

let text_data = "$ion_1_0 \"foo\" 5 false [1, 2, 3] ";
let text_data = r#"
$ion_1_0
{$7: ["a", "b", "c"]}
$4::"Gary"
"foo"
5
false
[1, 2, 3]
"#;
let binary_data = to_binary_ion(text_data)?;

test_input(text_data.as_bytes())?;
Expand Down
12 changes: 9 additions & 3 deletions src/lazy/binary/raw/value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,14 @@ impl<'a> Debug for LazyRawBinaryValue<'a> {
type ValueParseResult<'data, F> = IonResult<RawValueRef<'data, F>>;

impl<'data> LazyRawValuePrivate<'data> for LazyRawBinaryValue<'data> {
fn field_name(&self) -> Option<RawSymbolTokenRef<'data>> {
self.encoded_value.field_id.map(RawSymbolTokenRef::SymbolId)
fn field_name(&self) -> IonResult<RawSymbolTokenRef<'data>> {
if let Some(field_id) = self.encoded_value.field_id {
Ok(RawSymbolTokenRef::SymbolId(field_id))
} else {
IonResult::illegal_operation(
"requested field name, but value was not in a struct field",
)
}
}
}

Expand Down Expand Up @@ -179,7 +185,7 @@ impl<'data> LazyRawBinaryValue<'data> {

/// If this value is within a struct, returns the symbol ID of its field name as `Some(SymbolId)`.
/// Otherwise, returns `None`.
pub(crate) fn field_name(&self) -> Option<SymbolId> {
pub(crate) fn field_id(&self) -> Option<SymbolId> {
Comment on lines -182 to +188
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🗺️ I renamed this binary reader method from field_name to field_id to avoid it being confused with the raw reader trait method of the same name.

The binary reader can always return a symbol ID representing the field name if there is one. However, the raw reader trait needs to accommodate failure cases in the raw text reader and returns an IonResult<RawSymbolTokenRef> instead of an Option<SymbolId>.

self.encoded_value.field_id
}

Expand Down
6 changes: 3 additions & 3 deletions src/lazy/decoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ pub trait LazyDecoder<'data>: Sized + Debug + Clone {
// function while also preventing users from seeing or depending on it.
pub(crate) mod private {
use super::LazyDecoder;
use crate::RawSymbolTokenRef;
use crate::{IonResult, RawSymbolTokenRef};

pub trait LazyRawFieldPrivate<'data, D: LazyDecoder<'data>> {
/// Converts the `LazyRawField` impl to a `LazyRawValue` impl.
Expand All @@ -49,8 +49,8 @@ pub(crate) mod private {

pub trait LazyRawValuePrivate<'data> {
/// Returns the field name associated with this value. If the value is not inside a struct,
/// returns `None`.
fn field_name(&self) -> Option<RawSymbolTokenRef<'data>>;
/// returns `IllegalOperation`.
fn field_name(&self) -> IonResult<RawSymbolTokenRef<'data>>;
}
}

Expand Down
24 changes: 2 additions & 22 deletions src/lazy/encoding.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
use std::marker::PhantomData;

use crate::lazy::binary::raw::annotations_iterator::RawBinaryAnnotationsIterator;
use crate::lazy::binary::raw::r#struct::LazyRawBinaryStruct;
use crate::lazy::binary::raw::reader::LazyRawBinaryReader;
Expand All @@ -9,8 +7,7 @@ use crate::lazy::decoder::LazyDecoder;
use crate::lazy::text::raw::r#struct::LazyRawTextStruct;
use crate::lazy::text::raw::reader::LazyRawTextReader;
use crate::lazy::text::raw::sequence::LazyRawTextSequence;
use crate::lazy::text::value::LazyRawTextValue;
use crate::{IonResult, RawSymbolTokenRef};
use crate::lazy::text::value::{LazyRawTextValue, RawTextAnnotationsIterator};

// These types derive trait implementations in order to allow types that contain them
// to also derive trait implementations.
Expand All @@ -31,27 +28,10 @@ impl<'data> LazyDecoder<'data> for BinaryEncoding {
type AnnotationsIterator = RawBinaryAnnotationsIterator<'data>;
}

// === Placeholders ===
// The types below will need to be properly defined in order for the lazy text reader to be complete.
// They exist to satisfy various trait definitions.

#[derive(Debug, Clone)]
pub struct ToDoTextAnnotationsIterator<'data> {
spooky: &'data PhantomData<()>,
}

impl<'data> Iterator for ToDoTextAnnotationsIterator<'data> {
type Item = IonResult<RawSymbolTokenRef<'data>>;

fn next(&mut self) -> Option<Self::Item> {
todo!()
}
}

impl<'data> LazyDecoder<'data> for TextEncoding {
type Reader = LazyRawTextReader<'data>;
type Value = LazyRawTextValue<'data>;
type Sequence = LazyRawTextSequence<'data>;
type Struct = LazyRawTextStruct<'data>;
type AnnotationsIterator = ToDoTextAnnotationsIterator<'data>;
type AnnotationsIterator = RawTextAnnotationsIterator<'data>;
}
2 changes: 1 addition & 1 deletion src/lazy/raw_value_ref.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ impl<'data, D: LazyDecoder<'data>> RawValueRef<'data, D> {
if let RawValueRef::Int(i) = self {
i.expect_i64()
} else {
IonResult::decoding_error("expected an i64 (int)")
IonResult::decoding_error(format!("expected an i64 (int), found: {:?}", self))
}
}

Expand Down
70 changes: 65 additions & 5 deletions src/lazy/text/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use nom::bytes::streaming::{is_a, is_not, tag, take_until, take_while1};
use nom::character::streaming::{char, digit1, one_of, satisfy};
use nom::combinator::{fail, map, not, opt, peek, recognize, success, value};
use nom::error::{ErrorKind, ParseError};
use nom::multi::many0_count;
use nom::multi::{many0_count, many1_count};
use nom::sequence::{delimited, pair, preceded, separated_pair, terminated, tuple};
use nom::{CompareResult, IResult, InputLength, InputTake, Needed, Parser};

Expand Down Expand Up @@ -251,6 +251,19 @@ impl<'data> TextBufferView<'data> {
))
}

/// Matches a run of one or more annotations (each as matched by `match_annotation`).
pub fn match_annotations(self) -> IonMatchResult<'data> {
    let one_or_more_annotations = many1_count(Self::match_annotation);
    recognize(one_or_more_annotations)(self)
}

/// Matches a single annotation: a symbol token followed by a terminating '::'.
///
/// Both the symbol and the '::' delimiter are matched through `whitespace_and_then`. On
/// success, yields the matched symbol together with the range of input it occupied.
pub fn match_annotation(self) -> IonParseResult<'data, (MatchedSymbol, Range<usize>)> {
    let symbol_with_span = whitespace_and_then(match_and_span(Self::match_symbol));
    let double_colon = whitespace_and_then(tag("::"));
    terminated(symbol_with_span, double_colon)(self)
}

/// Matches a single value in a list OR the end of the list, allowing for leading whitespace
/// and comments in either case.
///
Expand All @@ -265,7 +278,7 @@ impl<'data> TextBufferView<'data> {
value(None, tag("]")),
// ...or a value...
terminated(
Self::match_value.map(Some),
Self::match_annotated_value.map(Some),
// ...followed by a comma or end-of-list
Self::match_delimiter_after_list_value,
),
Expand Down Expand Up @@ -317,7 +330,7 @@ impl<'data> TextBufferView<'data> {
separated_pair(
whitespace_and_then(match_and_span(Self::match_struct_field_name)),
whitespace_and_then(tag(":")),
whitespace_and_then(Self::match_value),
whitespace_and_then(Self::match_annotated_value),
),
whitespace_and_then(alt((tag(","), peek(tag("}"))))),
)(self)
Expand Down Expand Up @@ -369,10 +382,29 @@ impl<'data> TextBufferView<'data> {
/// Matches a single value at the top level. The caller must verify that the input is not an
/// IVM before calling; otherwise, that IVM will be recognized as an identifier/symbol.
fn match_top_level_value(self) -> IonParseResult<'data, RawStreamItem<'data, TextEncoding>> {
self.match_value()
self.match_annotated_value()
.map(|(remaining, value)| (remaining, RawStreamItem::Value(value)))
}

/// Matches a value that may be preceded by an annotations sequence.
///
/// When annotations are present, the returned value records the sequence's offset and
/// length, and its input is rewound so that it begins at the start of the annotations.
pub fn match_annotated_value(self) -> IonParseResult<'data, LazyRawTextValue<'data>> {
    let annotations_then_value = pair(
        opt(Self::match_annotations),
        whitespace_and_then(Self::match_value),
    );
    annotations_then_value
        .map(|(maybe_annotations, mut value)| match maybe_annotations {
            // No annotations: the value is returned exactly as it was matched.
            None => value,
            Some(annotations) => {
                let sequence_offset = annotations.offset();
                let sequence_length = annotations.len();
                value.encoded_value = value
                    .encoded_value
                    .with_annotations_sequence(sequence_offset, sequence_length);
                // Rewind the value's input to include the annotations sequence.
                value.input = self.slice_to_end(sequence_offset - self.offset());
                value
            }
        })
        .parse(self)
}

/// Matches a single scalar value or the beginning of a container.
pub fn match_value(self) -> IonParseResult<'data, LazyRawTextValue<'data>> {
alt((
Expand Down Expand Up @@ -1128,7 +1160,7 @@ where

/// Augments a given parser such that it returns the matched value and the range of input bytes
/// that it matched.
fn match_and_span<'data, P, O>(
pub(crate) fn match_and_span<'data, P, O>(
mut parser: P,
) -> impl Parser<TextBufferView<'data>, (O, Range<usize>), IonParseError<'data>>
where
Expand Down Expand Up @@ -1477,4 +1509,32 @@ mod tests {
mismatch_symbol(input);
}
}

#[test]
fn test_match_annotated_value() {
    // Asserts that `match_annotated_value` matches the given input.
    fn assert_matches(input: &str) {
        MatchTest::new(input).expect_match(match_length(TextBufferView::match_annotated_value));
    }
    // Asserts that `match_annotated_value` does NOT match the given input.
    fn assert_mismatches(input: &str) {
        MatchTest::new(input)
            .expect_mismatch(match_length(TextBufferView::match_annotated_value));
    }

    // Values carrying one or more annotations, with assorted whitespace and comments.
    let should_match = [
        "foo::5",
        "foo::bar::5",
        "foo :: 5",
        "foo::bar::baz::5",
        "foo :: /*comment*/ bar /*comment*/ :: baz :: 5",
        "foo::bar::baz::quux::quuz::5",
        "foo::'bar'::baz::$10::5",
    ];
    should_match.iter().copied().for_each(assert_matches);

    // Inputs the test expects the parser to reject.
    let should_not_match = ["foo", "foo:bar", "foo:::bar"];
    should_not_match.iter().copied().for_each(assert_mismatches);
}
}
21 changes: 13 additions & 8 deletions src/lazy/text/encoded_value.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use crate::lazy::text::buffer::TextBufferView;
use crate::lazy::text::matched::{MatchedSymbol, MatchedValue};
use crate::result::IonFailure;
use crate::{IonResult, IonType};
use crate::{IonResult, IonType, RawSymbolTokenRef};
use std::ops::Range;

/// Represents the type, offset, and length metadata of the various components of an encoded value
Expand Down Expand Up @@ -149,15 +149,20 @@ impl EncodedTextValue {
self.data_offset..(self.data_offset + self.data_length)
}

pub fn field_name<'data>(&self, input: TextBufferView<'data>) -> IonResult<&'data str> {
if self.field_name_offset == 0 {
return IonResult::illegal_operation(
pub fn field_name<'data>(
&self,
input: TextBufferView<'data>,
) -> IonResult<RawSymbolTokenRef<'data>> {
if let Some(field_name_syntax) = self.field_name_syntax() {
let relative_start =
self.data_offset - input.offset() - (self.field_name_offset as usize);
let field_name_bytes = input.slice(relative_start, self.field_name_length as usize);
field_name_syntax.read(field_name_bytes)
} else {
IonResult::illegal_operation(
"requested field name, but value was not in a struct field",
);
)
}
let relative_start = self.data_offset - input.offset() - (self.field_name_offset as usize);
let field_name_bytes = input.slice(relative_start, self.field_name_length as usize);
field_name_bytes.as_text()
}

pub fn field_name_range(&self) -> Option<Range<usize>> {
Expand Down
Loading