Skip to content

Commit fe922ff

Browse files
committed
Adds lazy reader support for blobs
1 parent 915c83a commit fe922ff

File tree

10 files changed

+289
-12
lines changed

10 files changed

+289
-12
lines changed

src/lazy/binary/raw/value.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -406,7 +406,7 @@ impl<'data> LazyRawBinaryValue<'data> {
406406
fn read_blob(&self) -> ValueParseResult<'data, BinaryEncoding> {
407407
debug_assert!(self.encoded_value.ion_type() == IonType::Blob);
408408
let bytes = self.value_body()?;
409-
Ok(RawValueRef::Blob(bytes))
409+
Ok(RawValueRef::Blob(bytes.into()))
410410
}
411411

412412
/// Helper method called by [`Self::read`]. Reads the current value as a clob.

src/lazy/bytes_ref.rs

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
use crate::text::text_formatter::IonValueFormatter;
2+
use crate::Bytes;
3+
use std::borrow::Cow;
4+
use std::fmt::{Debug, Display, Formatter};
5+
use std::ops::Deref;
6+
7+
/// A sequence of bytes stored in a [`Cow`], so it can either borrow a slice that lives for
/// `'data` or own its bytes outright.
pub struct BytesRef<'data> {
8+
// The bytes themselves: `Cow::Borrowed` for a borrowed slice, `Cow::Owned` for a `Vec<u8>`.
data: Cow<'data, [u8]>,
9+
}
10+
11+
impl<'data> Deref for BytesRef<'data> {
12+
type Target = [u8];
13+
14+
fn deref(&self) -> &Self::Target {
15+
self.data.as_ref()
16+
}
17+
}
18+
19+
impl<'data> BytesRef<'data> {
20+
pub fn to_owned(&self) -> Bytes {
21+
Bytes::from(self.as_ref())
22+
}
23+
24+
pub fn into_owned(self) -> Bytes {
25+
Bytes::from(self)
26+
}
27+
28+
pub fn data(&self) -> &[u8] {
29+
self.as_ref()
30+
}
31+
}
32+
33+
impl<'data> From<BytesRef<'data>> for Bytes {
34+
fn from(value: BytesRef<'data>) -> Self {
35+
match value.data {
36+
Cow::Borrowed(bytes) => Bytes::from(bytes),
37+
Cow::Owned(bytes) => Bytes::from(bytes),
38+
}
39+
}
40+
}
41+
42+
impl<'data, const N: usize> From<&'data [u8; N]> for BytesRef<'data> {
43+
fn from(bytes: &'data [u8; N]) -> Self {
44+
BytesRef {
45+
data: Cow::from(bytes.as_ref()),
46+
}
47+
}
48+
}
49+
50+
impl<'data> From<&'data [u8]> for BytesRef<'data> {
51+
fn from(bytes: &'data [u8]) -> Self {
52+
BytesRef {
53+
data: Cow::from(bytes),
54+
}
55+
}
56+
}
57+
58+
impl<'data> From<Vec<u8>> for BytesRef<'data> {
59+
fn from(bytes: Vec<u8>) -> Self {
60+
BytesRef {
61+
data: Cow::from(bytes),
62+
}
63+
}
64+
}
65+
66+
impl<'data> From<&'data str> for BytesRef<'data> {
67+
fn from(text: &'data str) -> Self {
68+
BytesRef {
69+
data: Cow::from(text.as_bytes()),
70+
}
71+
}
72+
}
73+
74+
impl<'data> PartialEq<[u8]> for BytesRef<'data> {
75+
fn eq(&self, other: &[u8]) -> bool {
76+
self.data() == other
77+
}
78+
}
79+
80+
impl<'data> PartialEq<&[u8]> for BytesRef<'data> {
81+
fn eq(&self, other: &&[u8]) -> bool {
82+
self.data() == *other
83+
}
84+
}
85+
86+
/// Compares a byte slice against the bytes wrapped by a `BytesRef`.
impl<'data> PartialEq<BytesRef<'data>> for [u8] {
    fn eq(&self, other: &BytesRef<'data>) -> bool {
        let other_bytes: &[u8] = other.data();
        self == other_bytes
    }
}
91+
92+
/// Two `BytesRef`s are equal when their bytes are equal; their lifetimes may differ.
impl<'a, 'b> PartialEq<BytesRef<'a>> for BytesRef<'b> {
    fn eq(&self, other: &BytesRef<'a>) -> bool {
        self.data() == other.data()
    }
}
97+
98+
impl<'data> Display for BytesRef<'data> {
99+
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
100+
let mut formatter = IonValueFormatter { output: f };
101+
formatter
102+
.format_blob(self.data())
103+
.map_err(|_| std::fmt::Error)
104+
}
105+
}
106+
107+
impl<'data> Debug for BytesRef<'data> {
108+
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
109+
const NUM_BYTES_TO_SHOW: usize = 32;
110+
let data = self.data.as_ref();
111+
// Shows up to the first 32 bytes in hex
112+
write!(f, "BytesRef: [")?;
113+
for byte in data.iter().copied().take(NUM_BYTES_TO_SHOW) {
114+
write!(f, "{:x} ", byte)?;
115+
}
116+
if data.len() > NUM_BYTES_TO_SHOW {
117+
write!(f, "...{} more", (data.len() - NUM_BYTES_TO_SHOW))?;
118+
}
119+
write!(f, "]")?;
120+
121+
Ok(())
122+
}
123+
}

src/lazy/mod.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
//! Provides an ergonomic, lazy view of an Ion stream that permits random access within each
22
//! top level value.
33
4-
mod any_encoding;
4+
pub mod any_encoding;
55
pub mod binary;
6+
pub mod bytes_ref;
67
pub mod decoder;
78
pub(crate) mod encoding;
89
pub mod raw_stream_item;

src/lazy/raw_value_ref.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
use crate::lazy::bytes_ref::BytesRef;
12
use crate::lazy::decoder::LazyDecoder;
23
use crate::lazy::str_ref::StrRef;
34
use crate::result::IonFailure;
@@ -18,7 +19,7 @@ pub enum RawValueRef<'data, D: LazyDecoder<'data>> {
1819
Timestamp(Timestamp),
1920
String(StrRef<'data>),
2021
Symbol(RawSymbolTokenRef<'data>),
21-
Blob(&'data [u8]),
22+
Blob(BytesRef<'data>),
2223
Clob(&'data [u8]),
2324
SExp(D::SExp),
2425
List(D::List),
@@ -140,7 +141,7 @@ impl<'data, D: LazyDecoder<'data>> RawValueRef<'data, D> {
140141
}
141142
}
142143

143-
pub fn expect_blob(self) -> IonResult<&'data [u8]> {
144+
pub fn expect_blob(self) -> IonResult<BytesRef<'data>> {
144145
if let RawValueRef::Blob(b) = self {
145146
Ok(b)
146147
} else {
@@ -247,7 +248,7 @@ mod tests {
247248
);
248249
assert_eq!(
249250
reader.next()?.expect_value()?.read()?.expect_blob()?,
250-
&[0x06, 0x5A, 0x1B] // Base64-decoded "Blob"
251+
[0x06u8, 0x5A, 0x1B].as_ref() // Base64-decoded "Blob"
251252
);
252253
assert_eq!(
253254
reader.next()?.expect_value()?.read()?.expect_clob()?,

src/lazy/text/buffer.rs

Lines changed: 77 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use std::str::FromStr;
66

77
use nom::branch::alt;
88
use nom::bytes::streaming::{is_a, is_not, tag, take_until, take_while1, take_while_m_n};
9-
use nom::character::streaming::{char, digit1, one_of, satisfy};
9+
use nom::character::streaming::{alphanumeric1, char, digit1, one_of, satisfy};
1010
use nom::combinator::{consumed, fail, map, not, opt, peek, recognize, success, value};
1111
use nom::error::{ErrorKind, ParseError};
1212
use nom::multi::{many0_count, many1_count};
@@ -17,8 +17,8 @@ use crate::lazy::encoding::TextEncoding;
1717
use crate::lazy::raw_stream_item::RawStreamItem;
1818
use crate::lazy::text::encoded_value::EncodedTextValue;
1919
use crate::lazy::text::matched::{
20-
MatchedDecimal, MatchedFloat, MatchedHoursAndMinutes, MatchedInt, MatchedString, MatchedSymbol,
21-
MatchedTimestamp, MatchedTimestampOffset, MatchedValue,
20+
MatchedBlob, MatchedDecimal, MatchedFloat, MatchedHoursAndMinutes, MatchedInt, MatchedString,
21+
MatchedSymbol, MatchedTimestamp, MatchedTimestampOffset, MatchedValue,
2222
};
2323
use crate::lazy::text::parse_result::{InvalidInputError, IonParseError};
2424
use crate::lazy::text::parse_result::{IonMatchResult, IonParseResult};
@@ -497,6 +497,12 @@ impl<'data> TextBufferView<'data> {
497497
)
498498
},
499499
),
500+
map(
501+
match_and_length(Self::match_blob),
502+
|(matched_blob, length)| {
503+
EncodedTextValue::new(MatchedValue::Blob(matched_blob), self.offset(), length)
504+
},
505+
),
500506
map(
501507
match_and_length(Self::match_list),
502508
|(matched_list, length)| {
@@ -1337,6 +1343,28 @@ impl<'data> TextBufferView<'data> {
13371343
recognize(pair(Self::match_any_digit, Self::match_any_digit)),
13381344
)(self)
13391345
}
1346+
1347+
/// Matches a complete blob, including the opening `{{` and closing `}}`.
1348+
pub fn match_blob(self) -> IonParseResult<'data, MatchedBlob> {
1349+
delimited(
1350+
tag("{{"),
1351+
// Only whitespace (not comments) can appear within the blob
1352+
preceded(Self::match_optional_whitespace, Self::match_base64_content),
1353+
preceded(Self::match_optional_whitespace, tag("}}")),
1354+
)
1355+
.map(|base64_data| {
1356+
MatchedBlob::new(base64_data.offset() - self.offset(), base64_data.len())
1357+
})
1358+
.parse(self)
1359+
}
1360+
1361+
/// Matches the base64 content within a blob.
1362+
fn match_base64_content(self) -> IonMatchResult<'data> {
1363+
recognize(terminated(
1364+
many1_count(alt((alphanumeric1, is_a("+/")))),
1365+
opt(alt((tag("=="), tag("=")))),
1366+
))(self)
1367+
}
13401368
}
13411369

13421370
// === nom trait implementations ===
@@ -2002,4 +2030,50 @@ mod tests {
20022030
mismatch_sexp(input);
20032031
}
20042032
}
2033+
2034+
// Exercises `TextBufferView::match_blob` with inputs that are expected to match and inputs
// that are expected not to.
#[test]
2035+
fn test_match_blob() {
2036+
// Helper: passes `input` to `MatchTest::expect_match` using the blob matcher.
fn match_blob(input: &str) {
2037+
MatchTest::new(input).expect_match(match_length(TextBufferView::match_blob));
2038+
}
2039+
// Helper: passes `input` to `MatchTest::expect_mismatch` using the blob matcher.
fn mismatch_blob(input: &str) {
2040+
MatchTest::new(input).expect_mismatch(match_length(TextBufferView::match_blob));
2041+
}
2042+
// Base64 encodings of UTF-8 strings, with and without whitespace inside the braces
2043+
let good_inputs = &[
2044+
// hello
2045+
"{{aGVsbG8=}}",
2046+
"{{ aGVsbG8=}}",
2047+
"{{aGVsbG8= }}",
2048+
"{{\taGVsbG8=\n\n}}",
2049+
// hello!
2050+
"{{aGVsbG8h}}",
2051+
"{{ aGVsbG8h}}",
2052+
"{{aGVsbG8h }}",
2053+
"{{ aGVsbG8h }}",
2054+
// razzle dazzle root beer
2055+
"{{cmF6emxlIGRhenpsZSByb290IGJlZXI=}}",
2056+
"{{\ncmF6emxlIGRhenpsZSByb290IGJlZXI=\r}}",
2057+
];
2058+
for input in good_inputs {
2059+
match_blob(input);
2060+
}
2061+
2062+
// Inputs that the blob matcher is expected to reject
let bad_inputs = &[
2063+
// illegal character $
2064+
"{{$aGVsbG8=}}",
2065+
// comment within braces
2066+
r#"{{
2067+
// Here's the data:
2068+
aGVsbG8=
2069+
}}"#,
2070+
// padding at the beginning
2071+
"{{=aGVsbG8}}",
2072+
// too much padding
2073+
"{{aGVsbG8===}}",
2074+
];
2075+
for input in bad_inputs {
2076+
mismatch_blob(input);
2077+
}
2078+
}
20052079
}

src/lazy/text/encoded_value.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ impl EncodedTextValue {
130130
MatchedValue::Timestamp(_) => IonType::Timestamp,
131131
MatchedValue::String(_) => IonType::String,
132132
MatchedValue::Symbol(_) => IonType::Symbol,
133+
MatchedValue::Blob(_) => IonType::Blob,
133134
MatchedValue::List => IonType::List,
134135
MatchedValue::SExp => IonType::SExp,
135136
MatchedValue::Struct => IonType::Struct,

src/lazy/text/matched.rs

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ use num_traits::Num;
3030
use smallvec::SmallVec;
3131

3232
use crate::decimal::coefficient::{Coefficient, Sign};
33+
use crate::lazy::bytes_ref::BytesRef;
3334
use crate::lazy::str_ref::StrRef;
3435
use crate::lazy::text::as_utf8::AsUtf8;
3536
use crate::lazy::text::buffer::TextBufferView;
@@ -52,6 +53,7 @@ pub(crate) enum MatchedValue {
5253
Timestamp(MatchedTimestamp),
5354
String(MatchedString),
5455
Symbol(MatchedSymbol),
56+
Blob(MatchedBlob),
5557
List,
5658
SExp,
5759
Struct,
@@ -750,6 +752,37 @@ impl MatchedHoursAndMinutes {
750752
}
751753
}
752754

755+
/// Records where the base64 text of a matched blob is located, as an offset and a length.
#[derive(Clone, Copy, Debug, PartialEq)]
756+
pub struct MatchedBlob {
757+
// Position within the blob at which the base64 characters begin
758+
content_offset: usize,
759+
// Length of the base64 characters
760+
content_length: usize,
761+
}
762+
763+
impl MatchedBlob {
764+
pub fn new(content_offset: usize, content_length: usize) -> Self {
765+
Self {
766+
content_offset,
767+
content_length,
768+
}
769+
}
770+
771+
pub(crate) fn read<'data>(
772+
&self,
773+
matched_input: TextBufferView<'data>,
774+
) -> IonResult<BytesRef<'data>> {
775+
let base64_text = matched_input.slice(self.content_offset, self.content_length);
776+
base64::decode(base64_text.bytes())
777+
.map_err(|e| {
778+
IonError::decoding_error(format!(
779+
"failed to parse blob with invalid base64 data:\n'{base64_text:?}'\n{e:?}:"
780+
))
781+
})
782+
.map(BytesRef::from)
783+
}
784+
}
785+
753786
#[cfg(test)]
754787
mod tests {
755788
use crate::lazy::text::buffer::TextBufferView;
@@ -900,4 +933,37 @@ mod tests {
900933

901934
Ok(())
902935
}
936+
937+
/// Matches several blobs, decodes them, and checks the decoded bytes against the expected text.
#[test]
fn read_blobs() -> IonResult<()> {
    // Matches `data` as a blob, reads it, and compares the result with the bytes of `expected`.
    fn expect_blob(data: &str, expected: &str) {
        let padded_input = format!("{data} "); // Append a space
        let buffer = TextBufferView::new(padded_input.as_bytes());
        let (_remaining, matched) = buffer.match_blob().unwrap();
        let actual = matched.read(buffer).unwrap();
        assert_eq!(
            actual,
            expected.as_bytes(),
            "Actual didn't match expected for input '{}'.\n{:?}\n!=\n{:?}",
            padded_input,
            actual,
            expected
        );
    }

    // (blob text, expected decoded text)
    let cases: [(&str, &str); 6] = [
        ("{{TWVyY3VyeQ==}}", "Mercury"),
        ("{{VmVudXM=}}", "Venus"),
        ("{{RWFydGg=}}", "Earth"),
        ("{{TWFycw==}}", "Mars"),
        ("{{ TWFycw== }}", "Mars"),
        ("{{\nTWFycw==\t\t }}", "Mars"),
    ];

    for (blob_text, decoded_text) in cases {
        expect_blob(blob_text, decoded_text);
    }

    Ok(())
}
903969
}

0 commit comments

Comments
 (0)