Skip to content

Commit

Permalink
Adds lazy reader support for blobs
Browse files Browse the repository at this point in the history
  • Loading branch information
zslayton committed Aug 20, 2023
1 parent 915c83a commit fe922ff
Show file tree
Hide file tree
Showing 10 changed files with 289 additions and 12 deletions.
2 changes: 1 addition & 1 deletion src/lazy/binary/raw/value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -406,7 +406,7 @@ impl<'data> LazyRawBinaryValue<'data> {
fn read_blob(&self) -> ValueParseResult<'data, BinaryEncoding> {
debug_assert!(self.encoded_value.ion_type() == IonType::Blob);
let bytes = self.value_body()?;
Ok(RawValueRef::Blob(bytes))
Ok(RawValueRef::Blob(bytes.into()))
}

/// Helper method called by [`Self::read`]. Reads the current value as a clob.
Expand Down
123 changes: 123 additions & 0 deletions src/lazy/bytes_ref.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
use crate::text::text_formatter::IonValueFormatter;
use crate::Bytes;
use std::borrow::Cow;
use std::fmt::{Debug, Display, Formatter};
use std::ops::Deref;

/// A sequence of bytes stored in a [`Cow`], so the data may either be borrowed for the `'data`
/// lifetime or owned by this value.
pub struct BytesRef<'data> {
// The bytes themselves: either a borrowed `&'data [u8]` or an owned `Vec<u8>`.
data: Cow<'data, [u8]>,
}

/// Lets a `BytesRef` be used wherever a byte slice is expected.
impl<'data> Deref for BytesRef<'data> {
    type Target = [u8];

    /// Returns the wrapped bytes as a slice.
    fn deref(&self) -> &[u8] {
        // `&Cow<[u8]>` deref-coerces to `&[u8]` whether the data is borrowed or owned.
        &self.data
    }
}

impl<'data> BytesRef<'data> {
    /// Copies the bytes into a new [`Bytes`], leaving this `BytesRef` usable afterwards.
    pub fn to_owned(&self) -> Bytes {
        Bytes::from(self.data())
    }

    /// Consumes this `BytesRef` and converts it into a [`Bytes`].
    pub fn into_owned(self) -> Bytes {
        self.into()
    }

    /// Returns the bytes as a slice.
    pub fn data(&self) -> &[u8] {
        &self.data
    }
}

impl<'data> From<BytesRef<'data>> for Bytes {
fn from(value: BytesRef<'data>) -> Self {
match value.data {
Cow::Borrowed(bytes) => Bytes::from(bytes),
Cow::Owned(bytes) => Bytes::from(bytes),
}
}
}

impl<'data, const N: usize> From<&'data [u8; N]> for BytesRef<'data> {
fn from(bytes: &'data [u8; N]) -> Self {
BytesRef {
data: Cow::from(bytes.as_ref()),
}
}
}

impl<'data> From<&'data [u8]> for BytesRef<'data> {
fn from(bytes: &'data [u8]) -> Self {
BytesRef {
data: Cow::from(bytes),
}
}
}

impl<'data> From<Vec<u8>> for BytesRef<'data> {
fn from(bytes: Vec<u8>) -> Self {
BytesRef {
data: Cow::from(bytes),
}
}
}

impl<'data> From<&'data str> for BytesRef<'data> {
fn from(text: &'data str) -> Self {
BytesRef {
data: Cow::from(text.as_bytes()),
}
}
}

/// Compares the wrapped bytes against a byte slice.
impl<'data> PartialEq<[u8]> for BytesRef<'data> {
    fn eq(&self, other: &[u8]) -> bool {
        &self.data[..] == other
    }
}

impl<'data> PartialEq<&[u8]> for BytesRef<'data> {
fn eq(&self, other: &&[u8]) -> bool {
self.data() == *other
}
}

/// Compares a byte slice against the bytes wrapped by a `BytesRef`.
impl<'data> PartialEq<BytesRef<'data>> for [u8] {
    fn eq(&self, other: &BytesRef<'data>) -> bool {
        *self == other.data[..]
    }
}

/// Two `BytesRef`s are equal when they wrap the same byte sequence; whether each side's data is
/// borrowed or owned, and the two lifetimes, play no part in the comparison.
impl<'a, 'b> PartialEq<BytesRef<'a>> for BytesRef<'b> {
    fn eq(&self, other: &BytesRef<'a>) -> bool {
        self.data[..] == other.data[..]
    }
}

/// Formats the bytes by delegating to `IonValueFormatter::format_blob`.
impl<'data> Display for BytesRef<'data> {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let mut formatter = IonValueFormatter { output: f };
        // Any failure reported by the formatter is surfaced as a plain `fmt::Error`.
        match formatter.format_blob(self.data()) {
            Ok(()) => Ok(()),
            Err(_) => Err(std::fmt::Error),
        }
    }
}

/// Debug output is a short hex dump: up to the first 32 bytes as space-separated, two-digit
/// lowercase hex values, followed by a count of any bytes that were left out.
///
/// For example, the bytes `06 5A 1B` are rendered as `BytesRef: [06 5a 1b]`.
impl<'data> Debug for BytesRef<'data> {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        const NUM_BYTES_TO_SHOW: usize = 32;
        let data: &[u8] = &self.data;

        write!(f, "BytesRef: [")?;
        for (index, byte) in data.iter().take(NUM_BYTES_TO_SHOW).enumerate() {
            // Separate bytes with a single space, without leaving one after the last byte.
            if index > 0 {
                write!(f, " ")?;
            }
            // Zero-pad so every byte takes two hex digits (`06`, not `6`); this keeps the
            // dump aligned and unambiguous.
            write!(f, "{byte:02x}")?;
        }
        if data.len() > NUM_BYTES_TO_SHOW {
            write!(f, " ...{} more", data.len() - NUM_BYTES_TO_SHOW)?;
        }
        write!(f, "]")
    }
}
3 changes: 2 additions & 1 deletion src/lazy/mod.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
//! Provides an ergonomic, lazy view of an Ion stream that permits random access within each
//! top level value.

mod any_encoding;
pub mod any_encoding;
pub mod binary;
pub mod bytes_ref;
pub mod decoder;
pub(crate) mod encoding;
pub mod raw_stream_item;
Expand Down
7 changes: 4 additions & 3 deletions src/lazy/raw_value_ref.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use crate::lazy::bytes_ref::BytesRef;
use crate::lazy::decoder::LazyDecoder;
use crate::lazy::str_ref::StrRef;
use crate::result::IonFailure;
Expand All @@ -18,7 +19,7 @@ pub enum RawValueRef<'data, D: LazyDecoder<'data>> {
Timestamp(Timestamp),
String(StrRef<'data>),
Symbol(RawSymbolTokenRef<'data>),
Blob(&'data [u8]),
Blob(BytesRef<'data>),
Clob(&'data [u8]),
SExp(D::SExp),
List(D::List),
Expand Down Expand Up @@ -140,7 +141,7 @@ impl<'data, D: LazyDecoder<'data>> RawValueRef<'data, D> {
}
}

pub fn expect_blob(self) -> IonResult<&'data [u8]> {
pub fn expect_blob(self) -> IonResult<BytesRef<'data>> {
if let RawValueRef::Blob(b) = self {
Ok(b)
} else {
Expand Down Expand Up @@ -247,7 +248,7 @@ mod tests {
);
assert_eq!(
reader.next()?.expect_value()?.read()?.expect_blob()?,
&[0x06, 0x5A, 0x1B] // Base64-decoded "Blob"
[0x06u8, 0x5A, 0x1B].as_ref() // Base64-decoded "Blob"
);
assert_eq!(
reader.next()?.expect_value()?.read()?.expect_clob()?,
Expand Down
80 changes: 77 additions & 3 deletions src/lazy/text/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use std::str::FromStr;

use nom::branch::alt;
use nom::bytes::streaming::{is_a, is_not, tag, take_until, take_while1, take_while_m_n};
use nom::character::streaming::{char, digit1, one_of, satisfy};
use nom::character::streaming::{alphanumeric1, char, digit1, one_of, satisfy};
use nom::combinator::{consumed, fail, map, not, opt, peek, recognize, success, value};
use nom::error::{ErrorKind, ParseError};
use nom::multi::{many0_count, many1_count};
Expand All @@ -17,8 +17,8 @@ use crate::lazy::encoding::TextEncoding;
use crate::lazy::raw_stream_item::RawStreamItem;
use crate::lazy::text::encoded_value::EncodedTextValue;
use crate::lazy::text::matched::{
MatchedDecimal, MatchedFloat, MatchedHoursAndMinutes, MatchedInt, MatchedString, MatchedSymbol,
MatchedTimestamp, MatchedTimestampOffset, MatchedValue,
MatchedBlob, MatchedDecimal, MatchedFloat, MatchedHoursAndMinutes, MatchedInt, MatchedString,
MatchedSymbol, MatchedTimestamp, MatchedTimestampOffset, MatchedValue,
};
use crate::lazy::text::parse_result::{InvalidInputError, IonParseError};
use crate::lazy::text::parse_result::{IonMatchResult, IonParseResult};
Expand Down Expand Up @@ -497,6 +497,12 @@ impl<'data> TextBufferView<'data> {
)
},
),
map(
match_and_length(Self::match_blob),
|(matched_blob, length)| {
EncodedTextValue::new(MatchedValue::Blob(matched_blob), self.offset(), length)
},
),
map(
match_and_length(Self::match_list),
|(matched_list, length)| {
Expand Down Expand Up @@ -1337,6 +1343,28 @@ impl<'data> TextBufferView<'data> {
recognize(pair(Self::match_any_digit, Self::match_any_digit)),
)(self)
}

/// Matches a complete blob, including the opening `{{` and closing `}}`.
pub fn match_blob(self) -> IonParseResult<'data, MatchedBlob> {
delimited(
tag("{{"),
// Only whitespace (not comments) can appear within the blob
preceded(Self::match_optional_whitespace, Self::match_base64_content),
preceded(Self::match_optional_whitespace, tag("}}")),
)
.map(|base64_data| {
MatchedBlob::new(base64_data.offset() - self.offset(), base64_data.len())
})
.parse(self)
}

/// Matches the base64 content within a blob.
///
/// The content is optional: the Ion text grammar permits a blob with no base64 characters at
/// all (`{{}}`), so when none are present this matches an empty slice rather than failing.
/// When characters are present, they may be followed by one or two `=` padding characters.
/// Padding that is not preceded by any base64 characters is not matched.
fn match_base64_content(self) -> IonMatchResult<'data> {
    recognize(opt(terminated(
        // One or more runs of characters from the base64 alphabet...
        many1_count(alt((alphanumeric1, is_a("+/")))),
        // ...optionally followed by padding.
        opt(alt((tag("=="), tag("=")))),
    )))(self)
}
}

// === nom trait implementations ===
Expand Down Expand Up @@ -2002,4 +2030,50 @@ mod tests {
mismatch_sexp(input);
}
}

#[test]
fn test_match_blob() {
    // Asserts that the blob matcher accepts `input`.
    fn expect_blob_match(input: &str) {
        MatchTest::new(input).expect_match(match_length(TextBufferView::match_blob));
    }
    // Asserts that the blob matcher rejects `input`.
    fn expect_blob_mismatch(input: &str) {
        MatchTest::new(input).expect_mismatch(match_length(TextBufferView::match_blob));
    }

    // Blobs holding the base64 encodings of UTF-8 strings, with assorted whitespace.
    let accepted = [
        // hello
        "{{aGVsbG8=}}",
        "{{ aGVsbG8=}}",
        "{{aGVsbG8= }}",
        "{{\taGVsbG8=\n\n}}",
        // hello!
        "{{aGVsbG8h}}",
        "{{ aGVsbG8h}}",
        "{{aGVsbG8h }}",
        "{{ aGVsbG8h }}",
        // razzle dazzle root beer
        "{{cmF6emxlIGRhenpsZSByb290IGJlZXI=}}",
        "{{\ncmF6emxlIGRhenpsZSByb290IGJlZXI=\r}}",
    ];
    accepted.iter().copied().for_each(expect_blob_match);

    let rejected = [
        // illegal character $
        "{{$aGVsbG8=}}",
        // comment within braces
        r#"{{
// Here's the data:
aGVsbG8=
}}"#,
        // padding at the beginning
        "{{=aGVsbG8}}",
        // too much padding
        "{{aGVsbG8===}}",
    ];
    rejected.iter().copied().for_each(expect_blob_mismatch);
}
}
1 change: 1 addition & 0 deletions src/lazy/text/encoded_value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ impl EncodedTextValue {
MatchedValue::Timestamp(_) => IonType::Timestamp,
MatchedValue::String(_) => IonType::String,
MatchedValue::Symbol(_) => IonType::Symbol,
MatchedValue::Blob(_) => IonType::Blob,
MatchedValue::List => IonType::List,
MatchedValue::SExp => IonType::SExp,
MatchedValue::Struct => IonType::Struct,
Expand Down
66 changes: 66 additions & 0 deletions src/lazy/text/matched.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ use num_traits::Num;
use smallvec::SmallVec;

use crate::decimal::coefficient::{Coefficient, Sign};
use crate::lazy::bytes_ref::BytesRef;
use crate::lazy::str_ref::StrRef;
use crate::lazy::text::as_utf8::AsUtf8;
use crate::lazy::text::buffer::TextBufferView;
Expand All @@ -52,6 +53,7 @@ pub(crate) enum MatchedValue {
Timestamp(MatchedTimestamp),
String(MatchedString),
Symbol(MatchedSymbol),
Blob(MatchedBlob),
List,
SExp,
Struct,
Expand Down Expand Up @@ -750,6 +752,37 @@ impl MatchedHoursAndMinutes {
}
}

/// Records where the base64 content of a matched text blob is located, so that the content can
/// be found again and decoded later by [`MatchedBlob::read`].
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct MatchedBlob {
// Position within the blob at which the base64 characters begin
content_offset: usize,
// Length of the base64 characters, in bytes
content_length: usize,
}

impl MatchedBlob {
pub fn new(content_offset: usize, content_length: usize) -> Self {
Self {
content_offset,
content_length,
}
}

pub(crate) fn read<'data>(
&self,
matched_input: TextBufferView<'data>,
) -> IonResult<BytesRef<'data>> {
let base64_text = matched_input.slice(self.content_offset, self.content_length);
base64::decode(base64_text.bytes())
.map_err(|e| {
IonError::decoding_error(format!(
"failed to parse blob with invalid base64 data:\n'{base64_text:?}'\n{e:?}:"
))
})
.map(BytesRef::from)
}
}

#[cfg(test)]
mod tests {
use crate::lazy::text::buffer::TextBufferView;
Expand Down Expand Up @@ -900,4 +933,37 @@ mod tests {

Ok(())
}

#[test]
fn read_blobs() -> IonResult<()> {
    // Matches the blob at the head of `data`, decodes it, and checks that the decoded bytes
    // equal the UTF-8 bytes of `expected`.
    fn expect_blob(data: &str, expected: &str) {
        // A trailing space is appended to the input before it is matched.
        let data = format!("{data} ");
        let buffer = TextBufferView::new(data.as_bytes());
        let (_remaining, matched) = buffer.match_blob().unwrap();
        let actual = matched.read(buffer).unwrap();
        assert_eq!(
            actual,
            expected.as_bytes(),
            "Actual didn't match expected for input '{}'.\n{:?}\n!=\n{:?}",
            data,
            actual,
            expected
        );
    }

    // Pairs of (Ion text blob, the text its base64 content decodes to).
    let cases = [
        ("{{TWVyY3VyeQ==}}", "Mercury"),
        ("{{VmVudXM=}}", "Venus"),
        ("{{RWFydGg=}}", "Earth"),
        ("{{TWFycw==}}", "Mars"),
        ("{{ TWFycw== }}", "Mars"),
        ("{{\nTWFycw==\t\t }}", "Mars"),
    ];
    cases
        .iter()
        .for_each(|&(input, expected)| expect_blob(input, expected));

    Ok(())
}
}
Loading

0 comments on commit fe922ff

Please sign in to comment.