Skip to content

Commit f728e08

Browse files
authored
Adds lazy reader support for blobs (#629)
1 parent c288e6b commit f728e08

File tree

10 files changed

+328
-14
lines changed

10 files changed

+328
-14
lines changed

src/lazy/binary/raw/value.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -406,7 +406,7 @@ impl<'data> LazyRawBinaryValue<'data> {
406406
fn read_blob(&self) -> ValueParseResult<'data, BinaryEncoding> {
407407
debug_assert!(self.encoded_value.ion_type() == IonType::Blob);
408408
let bytes = self.value_body()?;
409-
Ok(RawValueRef::Blob(bytes))
409+
Ok(RawValueRef::Blob(bytes.into()))
410410
}
411411

412412
/// Helper method called by [`Self::read`]. Reads the current value as a clob.

src/lazy/bytes_ref.rs

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
use crate::text::text_formatter::IonValueFormatter;
2+
use crate::Bytes;
3+
use std::borrow::Cow;
4+
use std::fmt::{Debug, Display, Formatter};
5+
use std::ops::Deref;
6+
7+
/// A sequence of bytes that is either borrowed for the `'data` lifetime or owned.
///
/// The bytes live in a [`Cow`], so a `BytesRef` can wrap an existing slice (see the
/// `From<&[u8]>` impl) or take ownership of a `Vec<u8>` (see the `From<Vec<u8>>` impl).
pub struct BytesRef<'data> {
    /// The underlying bytes, in either borrowed or owned form.
    data: Cow<'data, [u8]>,
}
10+
11+
impl<'data> Deref for BytesRef<'data> {
12+
type Target = [u8];
13+
14+
fn deref(&self) -> &Self::Target {
15+
self.data.as_ref()
16+
}
17+
}
18+
19+
impl<'data> BytesRef<'data> {
20+
pub fn to_owned(&self) -> Bytes {
21+
Bytes::from(self.as_ref())
22+
}
23+
24+
pub fn into_owned(self) -> Bytes {
25+
Bytes::from(self)
26+
}
27+
28+
pub fn data(&self) -> &[u8] {
29+
self.as_ref()
30+
}
31+
}
32+
33+
impl<'data> From<BytesRef<'data>> for Bytes {
34+
fn from(value: BytesRef<'data>) -> Self {
35+
match value.data {
36+
Cow::Borrowed(bytes) => Bytes::from(bytes),
37+
Cow::Owned(bytes) => Bytes::from(bytes),
38+
}
39+
}
40+
}
41+
42+
impl<'data, const N: usize> From<&'data [u8; N]> for BytesRef<'data> {
43+
fn from(bytes: &'data [u8; N]) -> Self {
44+
BytesRef {
45+
data: Cow::from(bytes.as_ref()),
46+
}
47+
}
48+
}
49+
50+
impl<'data> From<&'data [u8]> for BytesRef<'data> {
51+
fn from(bytes: &'data [u8]) -> Self {
52+
BytesRef {
53+
data: Cow::from(bytes),
54+
}
55+
}
56+
}
57+
58+
impl<'data> From<Vec<u8>> for BytesRef<'data> {
59+
fn from(bytes: Vec<u8>) -> Self {
60+
BytesRef {
61+
data: Cow::from(bytes),
62+
}
63+
}
64+
}
65+
66+
impl<'data> From<&'data str> for BytesRef<'data> {
67+
fn from(text: &'data str) -> Self {
68+
BytesRef {
69+
data: Cow::from(text.as_bytes()),
70+
}
71+
}
72+
}
73+
74+
impl<'data> PartialEq<[u8]> for BytesRef<'data> {
75+
fn eq(&self, other: &[u8]) -> bool {
76+
self.data() == other
77+
}
78+
}
79+
80+
impl<'data> PartialEq<&[u8]> for BytesRef<'data> {
81+
fn eq(&self, other: &&[u8]) -> bool {
82+
self.data() == *other
83+
}
84+
}
85+
86+
/// Mirror of `PartialEq<[u8]> for BytesRef`, allowing the byte slice on the left-hand side.
impl<'data> PartialEq<BytesRef<'data>> for [u8] {
    fn eq(&self, other: &BytesRef<'data>) -> bool {
        *self == *other.data
    }
}
91+
92+
/// Two `BytesRef`s are equal when their bytes are equal; their lifetimes and whether each is
/// borrowed or owned do not take part in the comparison.
impl<'a, 'b> PartialEq<BytesRef<'a>> for BytesRef<'b> {
    fn eq(&self, other: &BytesRef<'a>) -> bool {
        *self.data == *other.data
    }
}
97+
98+
impl<'data> Display for BytesRef<'data> {
99+
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
100+
let mut formatter = IonValueFormatter { output: f };
101+
formatter
102+
.format_blob(self.data())
103+
.map_err(|_| std::fmt::Error)
104+
}
105+
}
106+
107+
impl<'data> Debug for BytesRef<'data> {
108+
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
109+
const NUM_BYTES_TO_SHOW: usize = 32;
110+
let data = self.data.as_ref();
111+
// Shows up to the first 32 bytes in hex
112+
write!(f, "BytesRef: [")?;
113+
for byte in data.iter().copied().take(NUM_BYTES_TO_SHOW) {
114+
write!(f, "{:x} ", byte)?;
115+
}
116+
if data.len() > NUM_BYTES_TO_SHOW {
117+
write!(f, "...{} more", (data.len() - NUM_BYTES_TO_SHOW))?;
118+
}
119+
write!(f, "]")?;
120+
121+
Ok(())
122+
}
123+
}

src/lazy/mod.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
//! Provides an ergonomic, lazy view of an Ion stream that permits random access within each
22
//! top level value.
33
4-
mod any_encoding;
4+
pub mod any_encoding;
55
pub mod binary;
6+
pub mod bytes_ref;
67
pub mod decoder;
78
pub(crate) mod encoding;
89
pub mod raw_stream_item;

src/lazy/raw_value_ref.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
use crate::lazy::bytes_ref::BytesRef;
12
use crate::lazy::decoder::LazyDecoder;
23
use crate::lazy::str_ref::StrRef;
34
use crate::result::IonFailure;
@@ -18,7 +19,7 @@ pub enum RawValueRef<'data, D: LazyDecoder<'data>> {
1819
Timestamp(Timestamp),
1920
String(StrRef<'data>),
2021
Symbol(RawSymbolTokenRef<'data>),
21-
Blob(&'data [u8]),
22+
Blob(BytesRef<'data>),
2223
Clob(&'data [u8]),
2324
SExp(D::SExp),
2425
List(D::List),
@@ -140,7 +141,7 @@ impl<'data, D: LazyDecoder<'data>> RawValueRef<'data, D> {
140141
}
141142
}
142143

143-
pub fn expect_blob(self) -> IonResult<&'data [u8]> {
144+
pub fn expect_blob(self) -> IonResult<BytesRef<'data>> {
144145
if let RawValueRef::Blob(b) = self {
145146
Ok(b)
146147
} else {
@@ -247,7 +248,7 @@ mod tests {
247248
);
248249
assert_eq!(
249250
reader.next()?.expect_value()?.read()?.expect_blob()?,
250-
&[0x06, 0x5A, 0x1B] // Base64-decoded "Blob"
251+
[0x06u8, 0x5A, 0x1B].as_ref() // Base64-decoded "Blob"
251252
);
252253
assert_eq!(
253254
reader.next()?.expect_value()?.read()?.expect_clob()?,

src/lazy/text/buffer.rs

Lines changed: 94 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use std::str::FromStr;
66

77
use nom::branch::alt;
88
use nom::bytes::streaming::{is_a, is_not, tag, take_until, take_while1, take_while_m_n};
9-
use nom::character::streaming::{char, digit1, one_of, satisfy};
9+
use nom::character::streaming::{alphanumeric1, char, digit1, one_of, satisfy};
1010
use nom::combinator::{consumed, fail, map, not, opt, peek, recognize, success, value};
1111
use nom::error::{ErrorKind, ParseError};
1212
use nom::multi::{many0_count, many1_count};
@@ -17,8 +17,8 @@ use crate::lazy::encoding::TextEncoding;
1717
use crate::lazy::raw_stream_item::RawStreamItem;
1818
use crate::lazy::text::encoded_value::EncodedTextValue;
1919
use crate::lazy::text::matched::{
20-
MatchedDecimal, MatchedFloat, MatchedHoursAndMinutes, MatchedInt, MatchedString, MatchedSymbol,
21-
MatchedTimestamp, MatchedTimestampOffset, MatchedValue,
20+
MatchedBlob, MatchedDecimal, MatchedFloat, MatchedHoursAndMinutes, MatchedInt, MatchedString,
21+
MatchedSymbol, MatchedTimestamp, MatchedTimestampOffset, MatchedValue,
2222
};
2323
use crate::lazy::text::parse_result::{InvalidInputError, IonParseError};
2424
use crate::lazy::text::parse_result::{IonMatchResult, IonParseResult};
@@ -497,6 +497,12 @@ impl<'data> TextBufferView<'data> {
497497
)
498498
},
499499
),
500+
map(
501+
match_and_length(Self::match_blob),
502+
|(matched_blob, length)| {
503+
EncodedTextValue::new(MatchedValue::Blob(matched_blob), self.offset(), length)
504+
},
505+
),
500506
map(
501507
match_and_length(Self::match_list),
502508
|(matched_list, length)| {
@@ -1341,6 +1347,36 @@ impl<'data> TextBufferView<'data> {
13411347
recognize(pair(one_of("012345"), Self::match_any_digit)),
13421348
)(self)
13431349
}
1350+
1351+
/// Matches a complete blob, including the opening `{{` and closing `}}`.
1352+
pub fn match_blob(self) -> IonParseResult<'data, MatchedBlob> {
1353+
delimited(
1354+
tag("{{"),
1355+
// Only whitespace (not comments) can appear within the blob
1356+
recognize(Self::match_base64_content),
1357+
preceded(Self::match_optional_whitespace, tag("}}")),
1358+
)
1359+
.map(|base64_data| {
1360+
MatchedBlob::new(base64_data.offset() - self.offset(), base64_data.len())
1361+
})
1362+
.parse(self)
1363+
}
1364+
1365+
/// Matches the base64 content within a blob. Ion allows the base64 content to be broken up with
1366+
/// whitespace, so the matched input region may need to be stripped of whitespace before
1367+
/// the data can be decoded.
1368+
fn match_base64_content(self) -> IonMatchResult<'data> {
1369+
recognize(terminated(
1370+
many0_count(preceded(
1371+
Self::match_optional_whitespace,
1372+
alt((alphanumeric1, is_a("+/"))),
1373+
)),
1374+
opt(preceded(
1375+
Self::match_optional_whitespace,
1376+
alt((tag("=="), tag("="))),
1377+
)),
1378+
))(self)
1379+
}
13441380
}
13451381

13461382
// === nom trait implementations ===
@@ -2008,4 +2044,59 @@ mod tests {
20082044
mismatch_sexp(input);
20092045
}
20102046
}
2047+
2048+
#[test]
fn test_match_blob() {
    // Asserts that the blob matcher accepts `input`.
    fn match_blob(input: &str) {
        MatchTest::new(input).expect_match(match_length(TextBufferView::match_blob));
    }
    // Asserts that the blob matcher rejects `input`.
    fn mismatch_blob(input: &str) {
        MatchTest::new(input).expect_mismatch(match_length(TextBufferView::match_blob));
    }

    // Base64 encodings of utf-8 strings
    let good_inputs = &[
        // <empty blobs>
        "{{}}",
        "{{ }}",
        "{{\n\t}}",
        // hello
        "{{aGVsbG8=}}",
        "{{ aGVsbG8=}}",
        "{{aGVsbG8= }}",
        "{{\taGVsbG8=\n\n}}",
        "{{aG Vs bG 8 =}}",
        r#"{{
            aG Vs
            bG 8=
        }}"#,
        // hello!
        "{{aGVsbG8h}}",
        "{{ aGVsbG8h}}",
        "{{aGVsbG8h }}",
        "{{ aGVsbG8h }}",
        // razzle dazzle root beer
        "{{cmF6emxlIGRhenpsZSByb290IGJlZXI=}}",
        "{{\ncmF6emxlIGRhenpsZSByb290IGJlZXI=\r}}",
    ];
    good_inputs.iter().for_each(|input| match_blob(input));

    let bad_inputs = &[
        // illegal character $
        "{{$aGVsbG8=}}",
        // comment within braces
        r#"{{
            // Here's the data:
            aGVsbG8=
        }}"#,
        // padding at the beginning
        "{{=aGVsbG8}}",
        // too much padding
        "{{aGVsbG8===}}",
    ];
    bad_inputs.iter().for_each(|input| mismatch_blob(input));
}
20112102
}

src/lazy/text/encoded_value.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ impl EncodedTextValue {
130130
MatchedValue::Timestamp(_) => IonType::Timestamp,
131131
MatchedValue::String(_) => IonType::String,
132132
MatchedValue::Symbol(_) => IonType::Symbol,
133+
MatchedValue::Blob(_) => IonType::Blob,
133134
MatchedValue::List => IonType::List,
134135
MatchedValue::SExp => IonType::SExp,
135136
MatchedValue::Struct => IonType::Struct,

0 commit comments

Comments
 (0)