-
Notifications
You must be signed in to change notification settings - Fork 35
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adds lazy reader support for reading clobs #638
Changes from 36 commits
e0a83d8
89f79aa
840be4d
5db1ff0
07d4a70
181e0a5
357ca8f
716ff34
e29fec5
8f79a36
4cb9b2b
54470d2
78014e7
a6a3aa8
4fc9078
11174ac
719dbaa
f603872
4696ca5
c7129ac
44435ea
d50e05b
8283422
0f01099
b60f1fe
915c83a
fe922ff
066ddd8
c58e5f0
6b5ce1c
e45ec35
a3f8a21
62be7c9
0eacd3a
175009d
3421393
45cbf40
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
use crate::lazy::bytes_ref::BytesRef; | ||
use crate::text::text_formatter::IonValueFormatter; | ||
use crate::Str; | ||
use std::borrow::Cow; | ||
|
@@ -80,3 +81,12 @@ impl<'data> From<StrRef<'data>> for Str { | |
Str::from(text) | ||
} | ||
} | ||
|
||
impl<'data> From<StrRef<'data>> for BytesRef<'data> { | ||
fn from(value: StrRef<'data>) -> Self { | ||
match value.text { | ||
Cow::Borrowed(text) => text.as_bytes().into(), | ||
Cow::Owned(text) => Vec::from(text).into(), | ||
} | ||
} | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🗺️ This impl converts a |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,8 +17,8 @@ use crate::lazy::encoding::TextEncoding; | |
use crate::lazy::raw_stream_item::RawStreamItem; | ||
use crate::lazy::text::encoded_value::EncodedTextValue; | ||
use crate::lazy::text::matched::{ | ||
MatchedBlob, MatchedDecimal, MatchedFloat, MatchedHoursAndMinutes, MatchedInt, MatchedString, | ||
MatchedSymbol, MatchedTimestamp, MatchedTimestampOffset, MatchedValue, | ||
MatchedBlob, MatchedClob, MatchedDecimal, MatchedFloat, MatchedHoursAndMinutes, MatchedInt, | ||
MatchedString, MatchedSymbol, MatchedTimestamp, MatchedTimestampOffset, MatchedValue, | ||
}; | ||
use crate::lazy::text::parse_result::{InvalidInputError, IonParseError}; | ||
use crate::lazy::text::parse_result::{IonMatchResult, IonParseResult}; | ||
|
@@ -508,6 +508,12 @@ impl<'data> TextBufferView<'data> { | |
EncodedTextValue::new(MatchedValue::Blob(matched_blob), self.offset(), length) | ||
}, | ||
), | ||
map( | ||
match_and_length(Self::match_clob), | ||
|(matched_clob, length)| { | ||
EncodedTextValue::new(MatchedValue::Clob(matched_clob), self.offset(), length) | ||
}, | ||
), | ||
map( | ||
match_and_length(Self::match_list), | ||
|(matched_list, length)| { | ||
|
@@ -983,12 +989,12 @@ impl<'data> TextBufferView<'data> { | |
} | ||
|
||
/// Matches short- or long-form string. | ||
fn match_string(self) -> IonParseResult<'data, MatchedString> { | ||
pub fn match_string(self) -> IonParseResult<'data, MatchedString> { | ||
alt((Self::match_short_string, Self::match_long_string))(self) | ||
} | ||
|
||
/// Matches a short string. For example: `"foo"` | ||
fn match_short_string(self) -> IonParseResult<'data, MatchedString> { | ||
pub(crate) fn match_short_string(self) -> IonParseResult<'data, MatchedString> { | ||
delimited(char('"'), Self::match_short_string_body, char('"')) | ||
.map(|(_matched, contains_escaped_chars)| { | ||
if contains_escaped_chars { | ||
|
@@ -1002,13 +1008,13 @@ impl<'data> TextBufferView<'data> { | |
|
||
/// Returns a matched buffer and a boolean indicating whether any escaped characters were | ||
/// found in the short string. | ||
fn match_short_string_body(self) -> IonParseResult<'data, (Self, bool)> { | ||
pub(crate) fn match_short_string_body(self) -> IonParseResult<'data, (Self, bool)> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🗺️ The clob reading logic re-uses the short- and long-form string matchers to isolate the content within the larger match. |
||
Self::match_text_until_unescaped(self, b'\"') | ||
} | ||
|
||
/// Matches a long string comprised of any number of `'''`-enclosed segments interleaved | ||
/// with optional comments and whitespace. | ||
pub fn match_long_string(self) -> IonParseResult<'data, MatchedString> { | ||
pub(crate) fn match_long_string(self) -> IonParseResult<'data, MatchedString> { | ||
fold_many1( | ||
// Parser to keep applying repeatedly | ||
whitespace_and_then(Self::match_long_string_segment), | ||
|
@@ -1169,6 +1175,13 @@ impl<'data> TextBufferView<'data> { | |
contains_escaped_chars = true; | ||
continue; | ||
} | ||
if *byte == b'\r' { | ||
// If the text contains an unescaped carriage return, we may need to normalize it. | ||
// In some narrow cases, setting this flag to true may result in a sanitization buffer | ||
// being allocated when it isn't strictly necessary. | ||
contains_escaped_chars = true; | ||
continue; | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🗺️ In long-form clobs and long-form strings, we need to normalize unescaped There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
if *byte == delimiter { | ||
let matched = self.slice(0, index); | ||
let remaining = self.slice_to_end(index); | ||
|
@@ -1435,6 +1448,66 @@ impl<'data> TextBufferView<'data> { | |
.parse(self) | ||
} | ||
|
||
/// Matches a clob of either short- or long-form syntax. | ||
pub fn match_clob(self) -> IonParseResult<'data, MatchedClob> { | ||
delimited( | ||
tag("{{"), | ||
preceded( | ||
Self::match_optional_whitespace, | ||
alt(( | ||
value(MatchedClob::Short, Self::match_short_clob_body), | ||
value(MatchedClob::Long, Self::match_long_clob_body), | ||
)), | ||
), | ||
preceded(Self::match_optional_whitespace, tag("}}")), | ||
)(self) | ||
} | ||
|
||
/// Matches the body (inside the `{{` and `}}`) of a short-form clob. | ||
fn match_short_clob_body(self) -> IonMatchResult<'data> { | ||
let (remaining, (body, _matched_string)) = consumed(Self::match_short_string)(self)?; | ||
body.validate_clob_text()?; | ||
Ok((remaining, body)) | ||
} | ||
|
||
/// Matches the body (inside the `{{` and `}}`) of a long-form clob. | ||
fn match_long_clob_body(self) -> IonMatchResult<'data> { | ||
recognize(many1_count(preceded( | ||
Self::match_optional_whitespace, | ||
Self::match_long_clob_body_segment, | ||
)))(self) | ||
} | ||
|
||
/// Matches a single segment of a long-form clob's content. | ||
fn match_long_clob_body_segment(self) -> IonMatchResult<'data> { | ||
let (remaining, (body, _matched_string)) = consumed(Self::match_long_string_segment)(self)?; | ||
body.validate_clob_text()?; | ||
Ok((remaining, body)) | ||
} | ||
|
||
/// Returns an error if the buffer contains any byte that is not legal inside a clob. | ||
fn validate_clob_text(self) -> IonMatchResult<'data> { | ||
for byte in self.bytes().iter().copied() { | ||
if !Self::byte_is_legal_clob_ascii(byte) { | ||
let message = format!("found an illegal byte '{:0x}'in clob", byte); | ||
let error = InvalidInputError::new(self).with_description(message); | ||
return Err(nom::Err::Failure(IonParseError::Invalid(error))); | ||
} | ||
} | ||
// Return success without consuming | ||
Ok((self, self.slice(0, 0))) | ||
} | ||
|
||
/// Returns `false` if the specified byte cannot appear unescaped in a clob. | ||
fn byte_is_legal_clob_ascii(b: u8) -> bool { | ||
// Depending on where you look in the spec and/or `ion-tests`, you'll find conflicting | ||
// information about which ASCII characters can appear unescaped in a clob. Some say | ||
// "characters >= 0x20", but that excludes lots of whitespace characters that are < 0x20. | ||
// Some say "displayable ASCII", but DEL (0x7F) is shown to be legal in one of the ion-tests. | ||
// The definition used here has largely been inferred from the contents of `ion-tests`. | ||
b.is_ascii() | ||
&& (u32::from(b) >= 0x20 || WHITESPACE_CHARACTERS_AS_STR.as_bytes().contains(&b)) | ||
} | ||
/// Matches the base64 content within a blob. Ion allows the base64 content to be broken up with | ||
/// whitespace, so the matched input region may need to be stripped of whitespace before | ||
/// the data can be decoded. | ||
|
@@ -2189,6 +2262,48 @@ mod tests { | |
} | ||
|
||
#[test] | ||
fn test_match_clob() { | ||
fn match_clob(input: &str) { | ||
MatchTest::new(input).expect_match(match_length(TextBufferView::match_clob)); | ||
} | ||
fn mismatch_blob(input: &str) { | ||
MatchTest::new(input).expect_mismatch(match_length(TextBufferView::match_clob)); | ||
} | ||
// Base64 encodings of utf-8 strings | ||
let good_inputs = &[ | ||
r#"{{""}}"#, | ||
r#"{{''''''}}"#, | ||
r#"{{"foo"}}"#, | ||
r#"{{ "foo"}}"#, | ||
r#"{{ "foo" }}"#, | ||
r#"{{"foo" }}"#, | ||
r#"{{'''foo'''}}"#, | ||
r#"{{"foobar"}}"#, | ||
r#"{{'''foo''' '''bar'''}}"#, | ||
r#"{{ | ||
'''foo''' | ||
'''bar''' | ||
'''baz''' | ||
}}"#, | ||
]; | ||
for input in good_inputs { | ||
match_clob(input); | ||
} | ||
|
||
let bad_inputs = &[ | ||
r#"{{foo}}"#, // No quotes | ||
r#"{{"foo}}"#, // Missing closing quote | ||
r#"{{"foo"}"#, // Missing closing brace | ||
r#"{{'''foo'''}"#, // Missing closing brace | ||
r#"{{'''foo''' /*hi!*/ '''bar'''}}"#, // Interleaved comments | ||
r#"{{'''foo''' "bar"}}"#, // Mixed quote style | ||
r#"{{"😎🙂🙃"}}"#, // Contains unescaped non-ascii characters | ||
]; | ||
for input in bad_inputs { | ||
mismatch_blob(input); | ||
} | ||
} | ||
|
||
fn test_match_text_until_unescaped_str() { | ||
let input = TextBufferView::new(r" foo bar \''' baz''' quux ".as_bytes()); | ||
let (_remaining, (matched, contains_escapes)) = | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🗺️ Reading a `clob` now returns a `BytesRef<'_>` instead of a `&[u8]` to accommodate the escape decoding process that happens in text clobs. This change mirrors the one made for blobs in #629.