diff --git a/src/lazy/binary/raw/value.rs b/src/lazy/binary/raw/value.rs
index 231f9167..6dc5081d 100644
--- a/src/lazy/binary/raw/value.rs
+++ b/src/lazy/binary/raw/value.rs
@@ -406,7 +406,7 @@ impl<'data> LazyRawBinaryValue<'data> {
     fn read_blob(&self) -> ValueParseResult<'data, BinaryEncoding> {
         debug_assert!(self.encoded_value.ion_type() == IonType::Blob);
         let bytes = self.value_body()?;
-        Ok(RawValueRef::Blob(bytes))
+        Ok(RawValueRef::Blob(bytes.into()))
     }
 
     /// Helper method called by [`Self::read`]. Reads the current value as a clob.
diff --git a/src/lazy/bytes_ref.rs b/src/lazy/bytes_ref.rs
new file mode 100644
index 00000000..5865314b
--- /dev/null
+++ b/src/lazy/bytes_ref.rs
@@ -0,0 +1,130 @@
+use crate::text::text_formatter::IonValueFormatter;
+use crate::Bytes;
+use std::borrow::Cow;
+use std::fmt::{Debug, Display, Formatter};
+use std::ops::Deref;
+
+/// A byte sequence that is either borrowed for the `'data` lifetime or owned.
+///
+/// The bytes are held in a [`Cow`]: the `From` impls for slices, arrays, and `&str` borrow
+/// their input, while the `From<Vec<u8>>` impl takes ownership of the vector.
+pub struct BytesRef<'data> {
+    data: Cow<'data, [u8]>,
+}
+
+impl<'data> Deref for BytesRef<'data> {
+    type Target = [u8];
+
+    fn deref(&self) -> &Self::Target {
+        self.data.as_ref()
+    }
+}
+
+impl<'data> BytesRef<'data> {
+    /// Builds an owned [`Bytes`] from a borrow of this value's bytes; `self` remains usable.
+    pub fn to_owned(&self) -> Bytes {
+        Bytes::from(self.as_ref())
+    }
+
+    /// Consumes this `BytesRef` and converts it into an owned [`Bytes`].
+    pub fn into_owned(self) -> Bytes {
+        Bytes::from(self)
+    }
+
+    /// Returns the bytes as a slice.
+    pub fn data(&self) -> &[u8] {
+        self.as_ref()
+    }
+}
+
+impl<'data> From<BytesRef<'data>> for Bytes {
+    fn from(value: BytesRef<'data>) -> Self {
+        match value.data {
+            Cow::Borrowed(bytes) => Bytes::from(bytes),
+            Cow::Owned(bytes) => Bytes::from(bytes),
+        }
+    }
+}
+
+impl<'data, const N: usize> From<&'data [u8; N]> for BytesRef<'data> {
+    fn from(bytes: &'data [u8; N]) -> Self {
+        BytesRef {
+            data: Cow::from(bytes.as_ref()),
+        }
+    }
+}
+
+impl<'data> From<&'data [u8]> for BytesRef<'data> {
+    fn from(bytes: &'data [u8]) -> Self {
+        BytesRef {
+            data: Cow::from(bytes),
+        }
+    }
+}
+
+impl<'data> From<Vec<u8>> for BytesRef<'data> {
+    fn from(bytes: Vec<u8>) -> Self {
+        BytesRef {
+            data: Cow::from(bytes),
+        }
+    }
+}
+
+impl<'data> From<&'data str> for BytesRef<'data> {
+    fn from(text: &'data str) -> Self {
+        BytesRef {
+            data: Cow::from(text.as_bytes()),
+        }
+    }
+}
+
+impl<'data> PartialEq<[u8]> for BytesRef<'data> {
+    fn eq(&self, other: &[u8]) -> bool {
+        self.data() == other
+    }
+}
+
+impl<'data> PartialEq<&[u8]> for BytesRef<'data> {
+    fn eq(&self, other: &&[u8]) -> bool {
+        self.data() == *other
+    }
+}
+
+impl<'data> PartialEq<BytesRef<'data>> for [u8] {
+    fn eq(&self, other: &BytesRef<'data>) -> bool {
+        self == other.data()
+    }
+}
+
+impl<'a, 'b> PartialEq<BytesRef<'a>> for BytesRef<'b> {
+    fn eq(&self, other: &BytesRef<'a>) -> bool {
+        self == other.data()
+    }
+}
+
+impl<'data> Display for BytesRef<'data> {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        let mut formatter = IonValueFormatter { output: f };
+        formatter
+            .format_blob(self.data())
+            .map_err(|_| std::fmt::Error)
+    }
+}
+
+impl<'data> Debug for BytesRef<'data> {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        const NUM_BYTES_TO_SHOW: usize = 32;
+        let data = self.data.as_ref();
+        // Shows up to the first 32 bytes as zero-padded, two-digit hex
+        write!(f, "BytesRef: [")?;
+        for byte in data.iter().copied().take(NUM_BYTES_TO_SHOW) {
+            write!(f, "{:02x} ", byte)?;
+        }
+        if data.len() > NUM_BYTES_TO_SHOW {
+            write!(f, "...{} more", (data.len() - NUM_BYTES_TO_SHOW))?;
+        }
+        write!(f, "]")?;
+
+        Ok(())
+    }
+}
diff --git a/src/lazy/mod.rs b/src/lazy/mod.rs
index c2d2a4ce..32b16e12 100644
--- a/src/lazy/mod.rs
+++ b/src/lazy/mod.rs
@@ -1,8 +1,9 @@
 //! Provides an ergonomic, lazy view of an Ion stream that permits random access within each
 //! top level value.
 
-mod any_encoding;
+pub mod any_encoding;
 pub mod binary;
+pub mod bytes_ref;
 pub mod decoder;
 pub(crate) mod encoding;
 pub mod raw_stream_item;
diff --git a/src/lazy/raw_value_ref.rs b/src/lazy/raw_value_ref.rs
index 57934e0e..4c63bf33 100644
--- a/src/lazy/raw_value_ref.rs
+++ b/src/lazy/raw_value_ref.rs
@@ -1,3 +1,4 @@
+use crate::lazy::bytes_ref::BytesRef;
 use crate::lazy::decoder::LazyDecoder;
 use crate::lazy::str_ref::StrRef;
 use crate::result::IonFailure;
@@ -18,7 +19,7 @@ pub enum RawValueRef<'data, D: LazyDecoder<'data>> {
     Timestamp(Timestamp),
     String(StrRef<'data>),
     Symbol(RawSymbolTokenRef<'data>),
-    Blob(&'data [u8]),
+    Blob(BytesRef<'data>),
     Clob(&'data [u8]),
     SExp(D::SExp),
     List(D::List),
@@ -140,7 +141,7 @@ impl<'data, D: LazyDecoder<'data>> RawValueRef<'data, D> {
         }
     }
 
-    pub fn expect_blob(self) -> IonResult<&'data [u8]> {
+    pub fn expect_blob(self) -> IonResult<BytesRef<'data>> {
         if let RawValueRef::Blob(b) = self {
             Ok(b)
         } else {
@@ -247,7 +248,7 @@ mod tests {
         );
         assert_eq!(
             reader.next()?.expect_value()?.read()?.expect_blob()?,
-            &[0x06, 0x5A, 0x1B] // Base64-decoded "Blob"
+            [0x06u8, 0x5A, 0x1B].as_ref() // Base64-decoded "Blob"
         );
         assert_eq!(
             reader.next()?.expect_value()?.read()?.expect_clob()?,
diff --git a/src/lazy/text/buffer.rs b/src/lazy/text/buffer.rs
index 6cb6430e..b999c14d 100644
--- a/src/lazy/text/buffer.rs
+++ b/src/lazy/text/buffer.rs
@@ -6,7 +6,7 @@ use std::str::FromStr;
 
 use nom::branch::alt;
 use nom::bytes::streaming::{is_a, is_not, tag, take_until, take_while1, take_while_m_n};
-use nom::character::streaming::{char, digit1, one_of, satisfy};
+use nom::character::streaming::{alphanumeric1, char, digit1, one_of, satisfy};
 use nom::combinator::{consumed, fail, map, not, opt, peek, recognize, success, value};
 use nom::error::{ErrorKind, ParseError};
 use nom::multi::{many0_count, many1_count};
@@ -17,8 +17,8 @@ use crate::lazy::encoding::TextEncoding;
 use crate::lazy::raw_stream_item::RawStreamItem;
 use crate::lazy::text::encoded_value::EncodedTextValue;
 use crate::lazy::text::matched::{
-    MatchedDecimal, MatchedFloat, MatchedHoursAndMinutes, MatchedInt, MatchedString, MatchedSymbol,
-    MatchedTimestamp, MatchedTimestampOffset, MatchedValue,
+    MatchedBlob, MatchedDecimal, MatchedFloat, MatchedHoursAndMinutes, MatchedInt, MatchedString,
+    MatchedSymbol, MatchedTimestamp, MatchedTimestampOffset, MatchedValue,
 };
 use crate::lazy::text::parse_result::{InvalidInputError, IonParseError};
 use crate::lazy::text::parse_result::{IonMatchResult, IonParseResult};
@@ -497,6 +497,12 @@ impl<'data> TextBufferView<'data> {
                     )
                 },
             ),
+            map(
+                match_and_length(Self::match_blob),
+                |(matched_blob, length)| {
+                    EncodedTextValue::new(MatchedValue::Blob(matched_blob), self.offset(), length)
+                },
+            ),
             map(
                 match_and_length(Self::match_list),
                 |(matched_list, length)| {
@@ -1341,6 +1347,36 @@ impl<'data> TextBufferView<'data> {
             recognize(pair(one_of("012345"), Self::match_any_digit)),
         )(self)
     }
+
+    /// Matches a complete blob, including the opening `{{` and closing `}}`.
+    pub fn match_blob(self) -> IonParseResult<'data, MatchedBlob> {
+        delimited(
+            tag("{{"),
+            // Only whitespace (not comments) can appear within the blob
+            recognize(Self::match_base64_content),
+            preceded(Self::match_optional_whitespace, tag("}}")),
+        )
+        .map(|base64_data| {
+            MatchedBlob::new(base64_data.offset() - self.offset(), base64_data.len())
+        })
+        .parse(self)
+    }
+
+    /// Matches the base64 content within a blob. Ion allows the base64 content to be broken up with
+    /// whitespace, so the matched input region may need to be stripped of whitespace before
+    /// the data can be decoded.
+    fn match_base64_content(self) -> IonMatchResult<'data> {
+        recognize(terminated(
+            many0_count(preceded(
+                Self::match_optional_whitespace,
+                alt((alphanumeric1, is_a("+/"))),
+            )),
+            opt(preceded(
+                Self::match_optional_whitespace,
+                alt((tag("=="), tag("="))),
+            )),
+        ))(self)
+    }
 }
 
 // === nom trait implementations ===
@@ -2008,4 +2044,59 @@ mod tests {
             mismatch_sexp(input);
         }
     }
+
+    #[test]
+    fn test_match_blob() {
+        fn match_blob(input: &str) {
+            MatchTest::new(input).expect_match(match_length(TextBufferView::match_blob));
+        }
+        fn mismatch_blob(input: &str) {
+            MatchTest::new(input).expect_mismatch(match_length(TextBufferView::match_blob));
+        }
+        // Base64 encodings of utf-8 strings
+        let good_inputs = &[
+            // <empty blobs>
+            "{{}}",
+            "{{ }}",
+            "{{\n\t}}",
+            // hello
+            "{{aGVsbG8=}}",
+            "{{ aGVsbG8=}}",
+            "{{aGVsbG8= }}",
+            "{{\taGVsbG8=\n\n}}",
+            "{{aG Vs bG 8 =}}",
+            r#"{{
+                aG Vs
+                bG 8=
+            }}"#,
+            // hello!
+            "{{aGVsbG8h}}",
+            "{{ aGVsbG8h}}",
+            "{{aGVsbG8h }}",
+            "{{ aGVsbG8h }}",
+            // razzle dazzle root beer
+            "{{cmF6emxlIGRhenpsZSByb290IGJlZXI=}}",
+            "{{\ncmF6emxlIGRhenpsZSByb290IGJlZXI=\r}}",
+        ];
+        for input in good_inputs {
+            match_blob(input);
+        }
+
+        let bad_inputs = &[
+            // illegal character $
+            "{{$aGVsbG8=}}",
+            // comment within braces
+            r#"{{
+                // Here's the data:
+                aGVsbG8=
+            }}"#,
+            // padding at the beginning
+            "{{=aGVsbG8}}",
+            // too much padding
+            "{{aGVsbG8===}}",
+        ];
+        for input in bad_inputs {
+            mismatch_blob(input);
+        }
+    }
 }
diff --git a/src/lazy/text/encoded_value.rs b/src/lazy/text/encoded_value.rs
index 70b2687b..1423e743 100644
--- a/src/lazy/text/encoded_value.rs
+++ b/src/lazy/text/encoded_value.rs
@@ -130,6 +130,7 @@ impl EncodedTextValue {
             MatchedValue::Timestamp(_) => IonType::Timestamp,
             MatchedValue::String(_) => IonType::String,
             MatchedValue::Symbol(_) => IonType::Symbol,
+            MatchedValue::Blob(_) => IonType::Blob,
             MatchedValue::List => IonType::List,
             MatchedValue::SExp => IonType::SExp,
             MatchedValue::Struct => IonType::Struct,
diff --git a/src/lazy/text/matched.rs b/src/lazy/text/matched.rs
index e377320f..d7aaabe5 100644
--- a/src/lazy/text/matched.rs
+++ b/src/lazy/text/matched.rs
@@ -30,6 +30,7 @@ use num_traits::Num;
 use smallvec::SmallVec;
 
 use crate::decimal::coefficient::{Coefficient, Sign};
+use crate::lazy::bytes_ref::BytesRef;
 use crate::lazy::str_ref::StrRef;
 use crate::lazy::text::as_utf8::AsUtf8;
 use crate::lazy::text::buffer::TextBufferView;
@@ -52,6 +53,7 @@ pub(crate) enum MatchedValue {
     Timestamp(MatchedTimestamp),
     String(MatchedString),
     Symbol(MatchedSymbol),
+    Blob(MatchedBlob),
     List,
     SExp,
     Struct,
@@ -737,6 +739,57 @@ impl MatchedHoursAndMinutes {
     }
 }
 
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct MatchedBlob {
+    // Position within the blob at which the base64 characters begin
+    content_offset: usize,
+    // Length of the base64 characters
+    content_length: usize,
+}
+
+impl MatchedBlob {
+    pub fn new(content_offset: usize, content_length: usize) -> Self {
+        Self {
+            content_offset,
+            content_length,
+        }
+    }
+
+    pub(crate) fn read<'data>(
+        &self,
+        matched_input: TextBufferView<'data>,
+    ) -> IonResult<BytesRef<'data>> {
+        let base64_text = matched_input.slice(self.content_offset, self.content_length);
+        let matched_bytes = base64_text.bytes();
+
+        // Ion allows whitespace to appear in the middle of the base64 data; if the match
+        // has inner whitespace, we need to strip it out.
+        let contains_whitespace = matched_bytes.iter().any(|b| b.is_ascii_whitespace());
+
+        let decode_result = if contains_whitespace {
+            // This allocates a fresh Vec to store the sanitized bytes. It could be replaced by
+            // a reusable buffer if this proves to be a bottleneck.
+            let sanitized_base64_text: Vec<u8> = matched_bytes
+                .iter()
+                .copied()
+                .filter(|b| !b.is_ascii_whitespace())
+                .collect();
+            base64::decode(sanitized_base64_text)
+        } else {
+            base64::decode(matched_bytes)
+        };
+
+        decode_result
+            .map_err(|e| {
+                IonError::decoding_error(format!(
+                    "failed to parse blob with invalid base64 data:\n'{:?}'\n{e:?}:",
+                    matched_input.bytes()
+                ))
+            })
+            .map(BytesRef::from)
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use crate::lazy::text::buffer::TextBufferView;
@@ -887,4 +940,37 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn read_blobs() -> IonResult<()> {
+        fn expect_blob(data: &str, expected: &str) {
+            let data = format!("{data} "); // Append a space
+            let buffer = TextBufferView::new(data.as_bytes());
+            let (_remaining, matched) = buffer.match_blob().unwrap();
+            let actual = matched.read(buffer).unwrap();
+            assert_eq!(
+                actual,
+                expected.as_ref(),
+                "Actual didn't match expected for input '{}'.\n{:?}\n!=\n{:?}",
+                data,
+                actual,
+                expected
+            );
+        }
+
+        let tests = [
+            ("{{TWVyY3VyeQ==}}", "Mercury"),
+            ("{{VmVudXM=}}", "Venus"),
+            ("{{RWFydGg=}}", "Earth"),
+            ("{{TWFycw==}}", "Mars"),
+            ("{{ TWFycw== }}", "Mars"),
+            ("{{\nTWFycw==\t\t }}", "Mars"),
+        ];
+
+        for (input, expected) in tests {
+            expect_blob(input, expected);
+        }
+
+        Ok(())
+    }
 }
diff --git a/src/lazy/text/raw/reader.rs b/src/lazy/text/raw/reader.rs
index d6207a00..81a4bd6e 100644
--- a/src/lazy/text/raw/reader.rs
+++ b/src/lazy/text/raw/reader.rs
@@ -164,6 +164,10 @@ mod tests {
         $10
         $733
 
+        // Blob
+        {{cmF6emxlIGRhenpsZSByb290IGJlZXI=}}
+
+        // List
         [
             // First item
             1,
@@ -172,13 +176,15 @@ mod tests {
             // Third item
             3
         ]
-        
+
+        // S-Expression
         (
             foo++
             2
             3
         )
-        
+
+        // Struct
         {
             // Identifier
             foo: 100,
@@ -336,6 +342,9 @@ mod tests {
             RawValueRef::Symbol(RawSymbolTokenRef::SymbolId(733)),
         );
 
+        // {{cmF6emxlIGRhenpsZSByb290IGJlZXI=}}
+        expect_next(reader, RawValueRef::Blob("razzle dazzle root beer".into()));
+
         // [1, 2, 3]
         let list = reader.next()?.expect_value()?.read()?.expect_list()?;
         let mut sum = 0;
diff --git a/src/lazy/text/value.rs b/src/lazy/text/value.rs
index e4a203c4..b5bab953 100644
--- a/src/lazy/text/value.rs
+++ b/src/lazy/text/value.rs
@@ -68,6 +68,7 @@ impl<'data> LazyRawValue<'data, TextEncoding> for LazyRawTextValue<'data> {
             MatchedValue::Timestamp(t) => RawValueRef::Timestamp(t.read(matched_input)?),
             MatchedValue::String(s) => RawValueRef::String(s.read(matched_input)?),
             MatchedValue::Symbol(s) => RawValueRef::Symbol(s.read(matched_input)?),
+            MatchedValue::Blob(b) => RawValueRef::Blob(b.read(matched_input)?),
             MatchedValue::List => {
                 let lazy_list = LazyRawTextList { value: *self };
                 RawValueRef::List(lazy_list)
diff --git a/src/lazy/value_ref.rs b/src/lazy/value_ref.rs
index 58f5cc50..47a35d44 100644
--- a/src/lazy/value_ref.rs
+++ b/src/lazy/value_ref.rs
@@ -1,4 +1,5 @@
 use crate::element::Value;
+use crate::lazy::bytes_ref::BytesRef;
 use crate::lazy::decoder::LazyDecoder;
 use crate::lazy::r#struct::LazyStruct;
 use crate::lazy::sequence::{LazyList, LazySExp};
@@ -23,7 +24,7 @@ pub enum ValueRef<'top, 'data, D: LazyDecoder<'data>> {
     Timestamp(Timestamp),
     String(StrRef<'data>),
     Symbol(SymbolRef<'top>),
-    Blob(&'data [u8]),
+    Blob(BytesRef<'data>),
     Clob(&'data [u8]),
     SExp(LazySExp<'top, 'data, D>),
     List(LazyList<'top, 'data, D>),
@@ -169,7 +170,7 @@ impl<'top, 'data, D: LazyDecoder<'data>> ValueRef<'top, 'data, D> {
         }
     }
 
-    pub fn expect_blob(self) -> IonResult<&'data [u8]> {
+    pub fn expect_blob(self) -> IonResult<BytesRef<'data>> {
         if let ValueRef::Blob(b) = self {
             Ok(b)
         } else {
@@ -256,7 +257,7 @@ mod tests {
         assert_eq!(reader.expect_next()?.read()?.expect_string()?, "hello");
         assert_eq!(
             reader.expect_next()?.read()?.expect_blob()?,
-            &[0x06, 0x5A, 0x1B] // Base64-decoded "Blob"
+            [0x06u8, 0x5A, 0x1B].as_ref() // Base64-decoded "Blob"
         );
         assert_eq!(
             reader.expect_next()?.read()?.expect_clob()?,
@@ -310,7 +311,7 @@ mod tests {
         );
         assert_eq!(
             reader.expect_next()?.read()?,
-            ValueRef::Blob(&[0x06, 0x5A, 0x1B]) // Base64-decoded "Blob"
+            ValueRef::Blob([0x06, 0x5A, 0x1B].as_ref().into()) // Base64-decoded "Blob"
         );
         assert_eq!(
             reader.expect_next()?.read()?,