From dc8579dd7ddeec8adf135a0e58486c088fcfd128 Mon Sep 17 00:00:00 2001 From: Zack Slayton Date: Tue, 22 Aug 2023 20:30:59 -0400 Subject: [PATCH] Adds `LazyRawTextReader` support for reading symbols (#616) --- src/binary/binary_writer.rs | 6 +- src/lazy/text/buffer.rs | 191 ++++++++++++++++++-- src/lazy/text/encoded_value.rs | 1 + src/lazy/text/matched.rs | 315 ++++++++++++++++++++------------- src/lazy/text/raw/reader.rs | 68 ++++++- src/lazy/text/value.rs | 1 + src/lazy/value.rs | 6 +- src/raw_symbol_token_ref.rs | 16 +- src/symbol_ref.rs | 46 +++-- src/text/raw_text_writer.rs | 5 +- src/text/text_formatter.rs | 5 +- src/text/text_writer.rs | 6 +- 12 files changed, 483 insertions(+), 183 deletions(-) diff --git a/src/binary/binary_writer.rs b/src/binary/binary_writer.rs index 305604b2..186f845b 100644 --- a/src/binary/binary_writer.rs +++ b/src/binary/binary_writer.rs @@ -128,7 +128,7 @@ impl IonWriter for BinaryWriter { panic!("Cannot set symbol ID ${symbol_id} as annotation. It is undefined."); } } - RawSymbolTokenRef::Text(text) => self.get_or_create_symbol_id(text), + RawSymbolTokenRef::Text(text) => self.get_or_create_symbol_id(text.as_ref()), }; self.raw_writer.add_annotation(symbol_id); } @@ -145,7 +145,7 @@ impl IonWriter for BinaryWriter { )); } } - RawSymbolTokenRef::Text(text) => self.get_or_create_symbol_id(text), + RawSymbolTokenRef::Text(text) => self.get_or_create_symbol_id(text.as_ref()), }; self.raw_writer.write_symbol(symbol_id) } @@ -159,7 +159,7 @@ impl IonWriter for BinaryWriter { panic!("Cannot set symbol ID ${symbol_id} as field name. 
It is undefined."); } } - RawSymbolTokenRef::Text(text) => self.get_or_create_symbol_id(text), + RawSymbolTokenRef::Text(text) => self.get_or_create_symbol_id(text.as_ref()), }; self.raw_writer.set_field_name(text); } diff --git a/src/lazy/text/buffer.rs b/src/lazy/text/buffer.rs index ce6f0af7..83d2975d 100644 --- a/src/lazy/text/buffer.rs +++ b/src/lazy/text/buffer.rs @@ -5,8 +5,8 @@ use std::slice::Iter; use nom::branch::alt; use nom::bytes::streaming::{is_a, is_not, tag, take_until, take_while1}; -use nom::character::streaming::{char, digit1, one_of}; -use nom::combinator::{fail, map, opt, peek, recognize, success, value}; +use nom::character::streaming::{char, digit1, one_of, satisfy}; +use nom::combinator::{fail, map, not, opt, peek, recognize, success, value}; use nom::error::{ErrorKind, ParseError}; use nom::multi::many0_count; use nom::sequence::{delimited, pair, preceded, separated_pair, terminated, tuple}; @@ -16,9 +16,9 @@ use crate::lazy::encoding::TextEncoding; use crate::lazy::raw_stream_item::RawStreamItem; use crate::lazy::text::encoded_value::EncodedTextValue; use crate::lazy::text::matched::{ - MatchedFloat, MatchedInt, MatchedShortString, MatchedString, MatchedValue, + MatchedFloat, MatchedInt, MatchedShortString, MatchedString, MatchedSymbol, MatchedValue, }; -use crate::lazy::text::parse_result::IonParseError; +use crate::lazy::text::parse_result::{InvalidInputError, IonParseError}; use crate::lazy::text::parse_result::{IonMatchResult, IonParseResult}; use crate::lazy::text::value::LazyRawTextValue; use crate::result::DecodingError; @@ -275,6 +275,16 @@ impl<'data> TextBufferView<'data> { ) }, ), + map( + match_and_length(Self::match_symbol), + |(matched_symbol, length)| { + EncodedTextValue::new( + MatchedValue::Symbol(matched_symbol), + self.offset(), + length, + ) + }, + ), // TODO: The other Ion types )) .map(|encoded_value| LazyRawTextValue { @@ -463,6 +473,7 @@ impl<'data> TextBufferView<'data> { Self::match_float_numeric_value, ))(self) 
} + /// Matches special IEEE-754 values, including +/- infinity and NaN. fn match_float_special_value(self) -> IonParseResult<'data, MatchedFloat> { alt(( @@ -577,6 +588,109 @@ impl<'data> TextBufferView<'data> { /// Returns a matched buffer and a boolean indicating whether any escaped characters were /// found in the short string. fn match_short_string_body(self) -> IonParseResult<'data, (Self, bool)> { + Self::match_text_until_unescaped(self, b'\"') + } + + fn match_long_string(self) -> IonParseResult<'data, MatchedString> { + // TODO: implement long string matching + // The `fail` parser is a nom builtin that never matches. + fail(self) + } + + /// Matches a symbol ID (`$28`), an identifier (`foo`), or a quoted symbol (`'foo'`). + fn match_symbol(self) -> IonParseResult<'data, MatchedSymbol> { + // TODO: operators + alt(( + Self::match_symbol_id, + Self::match_identifier, + Self::match_quoted_symbol, + ))(self) + } + + /// Matches a symbol ID (`$28`). + fn match_symbol_id(self) -> IonParseResult<'data, MatchedSymbol> { + recognize(terminated( + // Discard a `$` and parse an integer representing the symbol ID. + // Note that symbol ID integers: + // * CANNOT have underscores in them. For example: `$1_0` is considered an identifier. + // * CAN have leading zeros. There's precedent for this in ion-java. + preceded(tag("$"), digit1), + // Peek at the next character to make sure it's unrelated to the symbol ID. + // The spec does not offer a formal definition of what ends a symbol ID. + // This checks for either a stop_character (which performs its own `peek()`) + // or a colon (":"), which could be a field delimiter (":") or the beginning of + // an annotation delimiter ('::'). + alt(( + // Each of the parsers passed to `alt` must have the same return type. `stop_character` + // returns a char instead of a &str, so we use `recognize()` to get a &str instead. 
+ recognize(Self::peek_stop_character), + peek(tag(":")), // Field delimiter (":") or annotation delimiter ("::") + )), + )) + .map(|_matched| MatchedSymbol::SymbolId) + .parse(self) + } + + /// Matches an identifier (`foo`). + fn match_identifier(self) -> IonParseResult<'data, MatchedSymbol> { + let (remaining, identifier_text) = recognize(terminated( + pair( + Self::identifier_initial_character, + Self::identifier_trailing_characters, + ), + not(Self::identifier_trailing_character), + ))(self)?; + // Ion defines a number of keywords that are syntactically indistinguishable from + // identifiers. Keywords take precedence; we must ensure that any identifier we find + // is not actually a keyword. + const KEYWORDS: &[&str] = &["true", "false", "nan", "null"]; + // In many situations, this check will not be necessary. Another type's parser will + // recognize the keyword as its own. (For example, `parse_boolean` would match the input + // text `false`.) However, because symbols can appear in annotations and the check for + // annotations precedes the parsing for all other types, we need this extra verification. + if KEYWORDS + .iter() + .any(|k| k.as_bytes() == identifier_text.bytes()) + { + // Finding a keyword is not a fatal error, it just means that this parser doesn't match. + return Err(nom::Err::Error(IonParseError::Invalid( + InvalidInputError::new(self), + ))); + } + Ok((remaining, MatchedSymbol::Identifier)) + } + + /// Matches any character that can appear at the start of an identifier. + fn identifier_initial_character(self) -> IonParseResult<'data, Self> { + recognize(alt((one_of("$_"), satisfy(|c| c.is_ascii_alphabetic()))))(self) + } + + /// Matches any character that is legal in an identifier, though not necessarily at the beginning. 
+ fn identifier_trailing_character(self) -> IonParseResult<'data, Self> { + recognize(alt((one_of("$_"), satisfy(|c| c.is_ascii_alphanumeric()))))(self) + } + + /// Matches characters that are legal in an identifier, though not necessarily at the beginning. + fn identifier_trailing_characters(self) -> IonParseResult<'data, Self> { + recognize(many0_count(Self::identifier_trailing_character))(self) + } + + /// Matches a quoted symbol (`'foo'`). + fn match_quoted_symbol(self) -> IonParseResult<'data, MatchedSymbol> { + delimited(char('\''), Self::match_quoted_symbol_body, char('\'')) + .map(|(_matched, contains_escaped_chars)| MatchedSymbol::Quoted(contains_escaped_chars)) + .parse(self) + } + + /// Returns a matched buffer and a boolean indicating whether any escaped characters were + /// found in the quoted symbol. + fn match_quoted_symbol_body(self) -> IonParseResult<'data, (Self, bool)> { + Self::match_text_until_unescaped(self, b'\'') + } + + /// A helper method for matching bytes until the specified delimiter. Ignores any byte + /// (including the delimiter) that is prefaced by the escape character `\`. + fn match_text_until_unescaped(self, delimiter: u8) -> IonParseResult<'data, (Self, bool)> { let mut is_escaped = false; let mut contains_escaped_chars = false; for (index, byte) in self.bytes().iter().enumerate() { @@ -590,7 +704,7 @@ impl<'data> TextBufferView<'data> { contains_escaped_chars = true; continue; } - if *byte == b'\"' { + if *byte == delimiter { let matched = self.slice(0, index); let remaining = self.slice_to_end(index); return Ok((remaining, (matched, contains_escaped_chars))); @@ -598,12 +712,6 @@ impl<'data> TextBufferView<'data> { } Err(nom::Err::Incomplete(Needed::Unknown)) } - - fn match_long_string(self) -> IonParseResult<'data, MatchedString> { - // TODO: implement long string matching - // The `fail` parser is a nom builtin that never matches. 
- fail(self) - } } // === nom trait implementations === @@ -839,13 +947,17 @@ mod tests { P: Parser, O, IonParseError<'data>>, { let result = self.try_match(parser); - // We expect this to fail for one reason or another - assert!( - result.is_err(), - "Expected a parse failure for input: {:?}\nResult: {:?}", - self.input, - result - ); + // We expect that only part of the input will match or that the entire + // input will be rejected outright. + if let Ok((_remaining, match_length)) = result { + assert_ne!( + match_length, + self.input.len() - 1, + "parser unexpectedly matched the complete input: '{:?}\nResult: {:?}", + self.input, + result + ); + } } } @@ -1038,13 +1150,54 @@ mod tests { r#" hello" "#, - // Missing a trailing quote + // Missing a closing quote r#" "hello "#, + // Closing quote is escaped + r#" + "hello\" + "#, ]; for input in bad_inputs { mismatch_string(input); } } + + #[test] + fn test_match_symbol() { + fn match_symbol(input: &str) { + MatchTest::new(input).expect_match(match_length(TextBufferView::match_symbol)); + } + fn mismatch_symbol(input: &str) { + MatchTest::new(input).expect_mismatch(match_length(TextBufferView::match_symbol)); + } + + // These inputs have leading/trailing whitespace to make them more readable, but the string + // matcher doesn't accept whitespace. We'll trim each one before testing it. 
+ let good_inputs = &[ + "'hello'", + "'😀😀😀'", + "'this has an escaped quote \\' right in the middle'", + "$308", + "$0", + "foo", + "name", + "$bar", + "_baz_quux", + ]; + for input in good_inputs { + match_symbol(input); + } + + let bad_inputs = &[ + "'hello", // No closing quote + "'hello\\'", // Closing quote is escaped + "$-8", // Negative SID + "nan", // Identifier that is also a keyword + ]; + for input in bad_inputs { + mismatch_symbol(input); + } + } } diff --git a/src/lazy/text/encoded_value.rs b/src/lazy/text/encoded_value.rs index d994a2bc..873e8f2f 100644 --- a/src/lazy/text/encoded_value.rs +++ b/src/lazy/text/encoded_value.rs @@ -116,6 +116,7 @@ impl EncodedTextValue { MatchedValue::Int(_) => IonType::Int, MatchedValue::Float(_) => IonType::Float, MatchedValue::String(_) => IonType::String, + MatchedValue::Symbol(_) => IonType::Symbol, } } diff --git a/src/lazy/text/matched.rs b/src/lazy/text/matched.rs index 8725b9c8..233c5c11 100644 --- a/src/lazy/text/matched.rs +++ b/src/lazy/text/matched.rs @@ -19,7 +19,9 @@ //! use the previously recorded information to minimize the amount of information that needs to be //! re-discovered. +use std::borrow::Cow; use std::num::IntErrorKind; +use std::str::FromStr; use nom::character::is_hex_digit; use num_bigint::BigInt; @@ -31,7 +33,7 @@ use crate::lazy::text::as_utf8::AsUtf8; use crate::lazy::text::buffer::TextBufferView; use crate::lazy::text::parse_result::InvalidInputError; use crate::result::{DecodingError, IonFailure}; -use crate::{Int, IonError, IonResult, IonType}; +use crate::{Int, IonError, IonResult, IonType, RawSymbolTokenRef}; /// A partially parsed Ion value. 
#[derive(Copy, Clone, Debug, PartialEq)] @@ -42,6 +44,7 @@ pub(crate) enum MatchedValue { Int(MatchedInt), Float(MatchedFloat), String(MatchedString), + Symbol(MatchedSymbol), // TODO: ...the other types } @@ -133,8 +136,6 @@ impl MatchedFloat { const STACK_ALLOC_BUFFER_CAPACITY: usize = 32; pub fn read(&self, matched_input: TextBufferView) -> IonResult { - use std::str::FromStr; - match self { MatchedFloat::PositiveInfinity => return Ok(f64::INFINITY), MatchedFloat::NegativeInfinity => return Ok(f64::NEG_INFINITY), @@ -212,140 +213,139 @@ impl MatchedString { // that replaces the escaped characters with their corresponding bytes. let mut sanitized = Vec::with_capacity(matched_input.len()); - Self::escape_short_string(body, &mut sanitized)?; + escape_text(body, &mut sanitized)?; let text = String::from_utf8(sanitized).unwrap(); Ok(StrRef::from(text.to_string())) } +} - fn escape_short_string( - matched_input: TextBufferView, - sanitized: &mut Vec, - ) -> IonResult<()> { - let mut remaining = matched_input; - while !remaining.is_empty() { - let next_escape = remaining.bytes().iter().position(|byte| *byte == b'\\'); - remaining = if let Some(escape_offset) = next_escape { - // Everything up to the '\' is already clean. Write that slice to 'sanitized'. - let already_clean = remaining.slice(0, escape_offset); - sanitized.extend_from_slice(already_clean.bytes()); - // Everything starting from the '\' needs to be evaluated. - let contains_escapes = remaining.slice_to_end(escape_offset); - Self::write_escaped(contains_escapes, sanitized)? 
- } else { - sanitized.extend_from_slice(remaining.bytes()); - // 'remaining' is now empty - remaining.slice_to_end(remaining.len()) - }; - } - - Ok(()) +fn escape_text(matched_input: TextBufferView, sanitized: &mut Vec) -> IonResult<()> { + let mut remaining = matched_input; + while !remaining.is_empty() { + let next_escape = remaining.bytes().iter().position(|byte| *byte == b'\\'); + remaining = if let Some(escape_offset) = next_escape { + // Everything up to the '\' is already clean. Write that slice to 'sanitized'. + let already_clean = remaining.slice(0, escape_offset); + sanitized.extend_from_slice(already_clean.bytes()); + // Everything starting from the '\' needs to be evaluated. + let contains_escapes = remaining.slice_to_end(escape_offset); + write_escaped(contains_escapes, sanitized)? + } else { + sanitized.extend_from_slice(remaining.bytes()); + // 'remaining' is now empty + remaining.slice_to_end(remaining.len()) + }; } - fn write_escaped<'data>( - input: TextBufferView<'data>, - sanitized: &mut Vec, - ) -> IonResult> { - // Note that by the time this method has been called, the parser has already confirmed that - // there is an appropriate closing delimiter. Thus, if any of the branches below run out of - // data, it means that it's a fatal error and not just an Incomplete. - debug_assert!(!input.is_empty()); - debug_assert!(input.bytes()[0] == b'\\'); - if input.len() == 1 { - return Err(IonError::Decoding( - DecodingError::new("found an escape ('\\') with no subsequent character") - .with_position(input.offset()), - )); - } - let input_after_escape = input.slice_to_end(2); // After (e.g.) '\x' - let escape_id = input.bytes()[1]; - let substitute = match escape_id { - b'n' => b'\n', - b'r' => b'\r', - b't' => b'\t', - b'\\' => b'\\', - b'/' => b'/', - b'"' => b'"', - b'\'' => b'\'', - b'?' 
=> b'?', - b'0' => 0x00u8, // NUL - b'a' => 0x07u8, // alert BEL - b'b' => 0x08u8, // backspace - b'v' => 0x0Bu8, // vertical tab - b'f' => 0x0Cu8, // form feed - // If the byte following the '\' is a real newline (that is: 0x0A), we discard it. - b'\n' => return Ok(input_after_escape), - // These cases require more sophisticated parsing, not just a 1-to-1 mapping of bytes - b'x' => return Self::hex_digits_code_point(2, input_after_escape, sanitized), - b'u' => return Self::hex_digits_code_point(4, input_after_escape, sanitized), - b'U' => return Self::hex_digits_code_point(8, input_after_escape, sanitized), - _ => { - return Err(IonError::Decoding( - DecodingError::new(format!("invalid escape sequence '\\{}", escape_id)) - .with_position(input.offset()), - )) - } - }; + Ok(()) +} - sanitized.push(substitute); - Ok(input_after_escape) +fn write_escaped<'data>( + input: TextBufferView<'data>, + sanitized: &mut Vec, +) -> IonResult> { + // Note that by the time this method has been called, the parser has already confirmed that + // there is an appropriate closing delimiter. Thus, if any of the branches below run out of + // data, it means that it's a fatal error and not just an Incomplete. + debug_assert!(!input.is_empty()); + debug_assert!(input.bytes()[0] == b'\\'); + if input.len() == 1 { + return Err(IonError::Decoding( + DecodingError::new("found an escape ('\\') with no subsequent character") + .with_position(input.offset()), + )); } - - fn hex_digits_code_point<'data>( - num_digits: usize, - input: TextBufferView<'data>, - sanitized: &mut Vec, - ) -> IonResult> { - if input.len() < num_digits { + let input_after_escape = input.slice_to_end(2); // After (e.g.) '\x' + let escape_id = input.bytes()[1]; + let substitute = match escape_id { + b'n' => b'\n', + b'r' => b'\r', + b't' => b'\t', + b'\\' => b'\\', + b'/' => b'/', + b'"' => b'"', + b'\'' => b'\'', + b'?' 
=> b'?', + b'0' => 0x00u8, // NUL + b'a' => 0x07u8, // alert BEL + b'b' => 0x08u8, // backspace + b'v' => 0x0Bu8, // vertical tab + b'f' => 0x0Cu8, // form feed + // If the byte following the '\' is a real newline (that is: 0x0A), we discard it. + b'\n' => return Ok(input_after_escape), + // These cases require more sophisticated parsing, not just a 1-to-1 mapping of bytes + b'x' => return hex_digits_code_point(2, input_after_escape, sanitized), + b'u' => return hex_digits_code_point(4, input_after_escape, sanitized), + b'U' => return hex_digits_code_point(8, input_after_escape, sanitized), + _ => { return Err(IonError::Decoding( - DecodingError::new(format!( - "found a {}-hex-digit escape sequence with only {} digits", - num_digits, - input.len() - )) - .with_position(input.offset()), - )); + DecodingError::new(format!("invalid escape sequence '\\{}", escape_id)) + .with_position(input.offset()), + )) } + }; - let hex_digit_bytes = &input.bytes()[..num_digits]; - - let all_are_hex_digits = hex_digit_bytes - .iter() - .take(num_digits) - .copied() - .all(is_hex_digit); - if !all_are_hex_digits { - return Err(IonError::Decoding( - DecodingError::new(format!( - "found a {}-hex-digit escape sequence that contained an invalid hex digit", - num_digits, - )) - .with_position(input.offset()), - )); - } - // We just confirmed all of the digits are ASCII hex digits, so these steps cannot fail. - let hex_digits = std::str::from_utf8(hex_digit_bytes).unwrap(); - let code_point = u32::from_str_radix(hex_digits, 16).unwrap(); - - // Check to see if this is a high surrogate; if it is, our code point isn't complete. Another - // unicode escape representing the low surrogate has to be next in the input to complete it. - // See the docs for this helper function for details. (Note: this will only ever be true for - // 4- and 8-digit escape sequences. `\x` escapes don't have enough digits to represent a - // high surrogate.) 
- if code_point_is_a_high_surrogate(code_point) { - todo!("support surrogate pairs") - } + sanitized.push(substitute); + Ok(input_after_escape) +} - // A Rust `char` can represent any Unicode scalar value--a code point that is not part of a - // surrogate pair. If the value we found isn't a high surrogate, then it's a complete scalar - // value. We can safely convert it to a `char`. - let character = char::from_u32(code_point).unwrap(); - let utf8_buffer: &mut [u8; 4] = &mut [0; 4]; - let utf8_encoded = character.encode_utf8(utf8_buffer); - sanitized.extend_from_slice(utf8_encoded.as_bytes()); +/// Reads the next `num_digits` bytes from `input` as a `char`, then writes that `char`'s UTF8 bytes +/// to `sanitized`. +fn hex_digits_code_point<'data>( + num_digits: usize, + input: TextBufferView<'data>, + sanitized: &mut Vec, +) -> IonResult> { + if input.len() < num_digits { + return Err(IonError::Decoding( + DecodingError::new(format!( + "found a {}-hex-digit escape sequence with only {} digits", + num_digits, + input.len() + )) + .with_position(input.offset()), + )); + } - // Skip beyond the digits we just processed - Ok(input.slice_to_end(num_digits)) + let hex_digit_bytes = &input.bytes()[..num_digits]; + + let all_are_hex_digits = hex_digit_bytes + .iter() + .take(num_digits) + .copied() + .all(is_hex_digit); + if !all_are_hex_digits { + return Err(IonError::Decoding( + DecodingError::new(format!( + "found a {}-hex-digit escape sequence that contained an invalid hex digit", + num_digits, + )) + .with_position(input.offset()), + )); } + // We just confirmed all of the digits are ASCII hex digits, so these steps cannot fail. + let hex_digits = std::str::from_utf8(hex_digit_bytes).unwrap(); + let code_point = u32::from_str_radix(hex_digits, 16).unwrap(); + + // Check to see if this is a high surrogate; if it is, our code point isn't complete. Another + // unicode escape representing the low surrogate has to be next in the input to complete it. 
+ // See the docs for this helper function for details. (Note: this will only ever be true for + // 4- and 8-digit escape sequences. `\x` escapes don't have enough digits to represent a + // high surrogate.) + if code_point_is_a_high_surrogate(code_point) { + todo!("support surrogate pairs") + } + + // A Rust `char` can represent any Unicode scalar value--a code point that is not part of a + // surrogate pair. If the value we found isn't a high surrogate, then it's a complete scalar + // value. We can safely convert it to a `char`. + let character = char::from_u32(code_point).unwrap(); + let utf8_buffer: &mut [u8; 4] = &mut [0; 4]; + let utf8_encoded = character.encode_utf8(utf8_buffer); + sanitized.extend_from_slice(utf8_encoded.as_bytes()); + + // Skip beyond the digits we just processed + Ok(input.slice_to_end(num_digits)) } /// Returns `true` if the provided code point is a utf-16 high surrogate. @@ -373,3 +373,72 @@ impl MatchedString { fn code_point_is_a_high_surrogate(value: u32) -> bool { (0xD800..=0xDFFF).contains(&value) } + +#[derive(Copy, Clone, Debug, PartialEq)] +pub(crate) enum MatchedSymbol { + /// A numeric symbol ID (e.g. `$21`) + SymbolId, + /// The symbol is an unquoted identifier (e.g. `foo`) + Identifier, + /// The symbol is delimited by single quotes. Holds a `bool` indicating whether the + /// matched input contained any escaped bytes. 
+ Quoted(bool), + // TODO: Operators in S-Expressions +} + +impl MatchedSymbol { + pub fn read<'data>( + &self, + matched_input: TextBufferView<'data>, + ) -> IonResult> { + match self { + MatchedSymbol::SymbolId => self.read_symbol_id(matched_input), + MatchedSymbol::Identifier => self.read_identifier(matched_input), + MatchedSymbol::Quoted(contains_escaped_chars) => { + self.read_quoted(matched_input, *contains_escaped_chars) + } + } + } + + fn read_quoted<'data>( + &self, + matched_input: TextBufferView<'data>, + contains_escaped_chars: bool, + ) -> IonResult> { + // Take a slice of the input that ignores the first and last bytes, which are quotes. + let body = matched_input.slice(1, matched_input.len() - 2); + if !contains_escaped_chars { + // There are no escaped characters, so we can just validate the string in-place. + let text = body.as_text()?; + let str_ref = RawSymbolTokenRef::Text(text.into()); + return Ok(str_ref); + } + + // Otherwise, there are escaped characters. We need to build a new version of our symbol + // that replaces the escaped characters with their corresponding bytes. + let mut sanitized = Vec::with_capacity(matched_input.len()); + + escape_text(body, &mut sanitized)?; + let text = String::from_utf8(sanitized).unwrap(); + Ok(RawSymbolTokenRef::Text(text.into())) + } + fn read_identifier<'data>( + &self, + matched_input: TextBufferView<'data>, + ) -> IonResult> { + matched_input + .as_text() + .map(|t| RawSymbolTokenRef::Text(Cow::Borrowed(t))) + } + fn read_symbol_id<'data>( + &self, + matched_input: TextBufferView<'data>, + ) -> IonResult> { + // Skip past the first byte, which has to be a `$`. + let text = matched_input.slice_to_end(1).as_text()?; + // It's not possible for the number parsing to fail because the matcher's rules + // guarantee that this string contains only decimal digits. 
+ let sid = usize::from_str(text).expect("loading symbol ID as usize"); + Ok(RawSymbolTokenRef::SymbolId(sid)) + } +} diff --git a/src/lazy/text/raw/reader.rs b/src/lazy/text/raw/reader.rs index 52346b8f..d45a52ce 100644 --- a/src/lazy/text/raw/reader.rs +++ b/src/lazy/text/raw/reader.rs @@ -71,11 +71,13 @@ mod tests { use super::*; use crate::lazy::decoder::LazyRawValue; use crate::lazy::raw_value_ref::RawValueRef; - use crate::IonType; + use crate::{IonType, RawSymbolTokenRef}; #[test] fn test_top_level() -> IonResult<()> { - let data = r#" + let mut data = String::new(); + data.push_str( + r#" /* This test demonstrates lazily reading top-level values of various Ion types. The values are interspersed with @@ -117,14 +119,30 @@ mod tests { "\x48ello, \x77orld!" // \x 2-digit hex escape "\u0048ello, \u0077orld!" // \u 4-digit hex escape "\U00000048ello, \U00000077orld!" // \U 8-digit hex escape - - "#; - // Make a mutable string so we can append some things that require Rust-level escapes - let mut data = String::from(data); + "#, + ); // Escaped newlines are discarded data.push_str("\"Hello,\\\n world!\""); + data.push_str( + r#" + // Symbols + + 'foo' + 'Hello, world!' 
+ '😎😎😎' + + firstName + date_of_birth + $variable + + $0 + $10 + $733 + "#, + ); + fn expect_next<'data>( reader: &mut LazyRawTextReader<'data>, expected: RawValueRef<'data, TextEncoding>, @@ -197,6 +215,44 @@ mod tests { expect_next(reader, RawValueRef::String("Hello, world!".into())); // "\"Hello,\\\n world!\" " expect_next(reader, RawValueRef::String("Hello, world!".into())); + // 'foo' + expect_next( + reader, + RawValueRef::Symbol(RawSymbolTokenRef::Text("foo".into())), + ); + expect_next( + reader, + RawValueRef::Symbol(RawSymbolTokenRef::Text("Hello, world!".into())), + ); + expect_next( + reader, + RawValueRef::Symbol(RawSymbolTokenRef::Text("😎😎😎".into())), + ); + // firstName + expect_next( + reader, + RawValueRef::Symbol(RawSymbolTokenRef::Text("firstName".into())), + ); + // date_of_birth + expect_next( + reader, + RawValueRef::Symbol(RawSymbolTokenRef::Text("date_of_birth".into())), + ); + // $variable + expect_next( + reader, + RawValueRef::Symbol(RawSymbolTokenRef::Text("$variable".into())), + ); + // $0 + expect_next(reader, RawValueRef::Symbol(RawSymbolTokenRef::SymbolId(0))); + // $10 + expect_next(reader, RawValueRef::Symbol(RawSymbolTokenRef::SymbolId(10))); + // $733 + expect_next( + reader, + RawValueRef::Symbol(RawSymbolTokenRef::SymbolId(733)), + ); + Ok(()) } } diff --git a/src/lazy/text/value.rs b/src/lazy/text/value.rs index 8f60346c..d0c4365c 100644 --- a/src/lazy/text/value.rs +++ b/src/lazy/text/value.rs @@ -53,6 +53,7 @@ impl<'data> LazyRawValue<'data, TextEncoding> for LazyRawTextValue<'data> { MatchedValue::Float(f) => RawValueRef::Float(f.read(matched_input)?), // ...decimal, timestamp... MatchedValue::String(s) => RawValueRef::String(s.read(matched_input)?), + MatchedValue::Symbol(s) => RawValueRef::Symbol(s.read(matched_input)?), // ...and the rest! 
}; Ok(value_ref) diff --git a/src/lazy/value.rs b/src/lazy/value.rs index 8f09cdbf..f25caf7e 100644 --- a/src/lazy/value.rs +++ b/src/lazy/value.rs @@ -9,6 +9,7 @@ use crate::{ Annotations, Element, IntoAnnotatedElement, IonError, IonResult, IonType, RawSymbolTokenRef, SymbolRef, SymbolTable, Value, }; +use std::borrow::Cow; /// A value in a binary Ion stream whose header has been parsed but whose body (i.e. its data) has /// not. A `LazyValue` is immutable; its data can be read any number of times. @@ -184,7 +185,8 @@ impl<'top, 'data, D: LazyDecoder<'data>> LazyValue<'top, 'data, D> { )) })? .into(), - RawSymbolTokenRef::Text(text) => text.into(), + RawSymbolTokenRef::Text(Cow::Borrowed(text)) => text.into(), + RawSymbolTokenRef::Text(Cow::Owned(text)) => text.into(), }; ValueRef::Symbol(symbol) } @@ -333,7 +335,7 @@ where )), Some(symbol) => Some(Ok(symbol.into())), }, - Ok(RawSymbolTokenRef::Text(text)) => Some(Ok(SymbolRef::with_text(text))), + Ok(RawSymbolTokenRef::Text(text)) => Some(Ok(text.into())), Err(e) => Some(Err(e)), } } diff --git a/src/raw_symbol_token_ref.rs b/src/raw_symbol_token_ref.rs index d4a00c4d..dddedc7c 100644 --- a/src/raw_symbol_token_ref.rs +++ b/src/raw_symbol_token_ref.rs @@ -1,11 +1,12 @@ use crate::raw_symbol_token::RawSymbolToken; use crate::{Symbol, SymbolId}; +use std::borrow::Cow; /// Like RawSymbolToken, but the Text variant holds a borrowed reference instead of a String. #[derive(Debug, Clone, PartialEq, Eq)] pub enum RawSymbolTokenRef<'a> { SymbolId(SymbolId), - Text(&'a str), + Text(Cow<'a, str>), } /// Implemented by types that can be viewed as a [RawSymbolTokenRef] without allocations. 
@@ -15,10 +16,7 @@ pub trait AsRawSymbolTokenRef { impl<'a> AsRawSymbolTokenRef for RawSymbolTokenRef<'a> { fn as_raw_symbol_token_ref(&self) -> RawSymbolTokenRef { - match self { - RawSymbolTokenRef::SymbolId(sid) => RawSymbolTokenRef::SymbolId(*sid), - RawSymbolTokenRef::Text(text) => RawSymbolTokenRef::Text(text), - } + self.clone() } } @@ -30,20 +28,20 @@ impl AsRawSymbolTokenRef for SymbolId { impl AsRawSymbolTokenRef for String { fn as_raw_symbol_token_ref(&self) -> RawSymbolTokenRef { - RawSymbolTokenRef::Text(self.as_str()) + RawSymbolTokenRef::Text(Cow::from(self.as_str())) } } impl AsRawSymbolTokenRef for &str { fn as_raw_symbol_token_ref(&self) -> RawSymbolTokenRef { - RawSymbolTokenRef::Text(self) + RawSymbolTokenRef::Text(Cow::from(*self)) } } impl AsRawSymbolTokenRef for Symbol { fn as_raw_symbol_token_ref(&self) -> RawSymbolTokenRef { match self.text() { - Some(text) => RawSymbolTokenRef::Text(text), + Some(text) => RawSymbolTokenRef::Text(Cow::from(text)), None => RawSymbolTokenRef::SymbolId(0), } } @@ -62,7 +60,7 @@ impl AsRawSymbolTokenRef for RawSymbolToken { fn as_raw_symbol_token_ref(&self) -> RawSymbolTokenRef { match self { RawSymbolToken::SymbolId(sid) => RawSymbolTokenRef::SymbolId(*sid), - RawSymbolToken::Text(text) => RawSymbolTokenRef::Text(text.as_str()), + RawSymbolToken::Text(text) => RawSymbolTokenRef::Text(Cow::from(text.as_str())), } } } diff --git a/src/symbol_ref.rs b/src/symbol_ref.rs index 9cd42cac..815c75fe 100644 --- a/src/symbol_ref.rs +++ b/src/symbol_ref.rs @@ -1,5 +1,5 @@ use crate::Symbol; -use std::borrow::Borrow; +use std::borrow::{Borrow, Cow}; use std::fmt::{Debug, Formatter}; use std::hash::{Hash, Hasher}; @@ -7,19 +7,19 @@ use std::hash::{Hash, Hasher}; /// static lifetime), a `SymbolRef` may have known or undefined text (i.e. `$0`). 
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone)] pub struct SymbolRef<'a> { - text: Option<&'a str>, + text: Option<Cow<'a, str>>, } impl<'a> Debug for SymbolRef<'a> { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.text.unwrap_or("$0")) + write!(f, "{}", self.text().unwrap_or("$0")) } } impl<'a> SymbolRef<'a> { /// If this symbol has known text, returns `Some(&str)`. Otherwise, returns `None`. pub fn text(&self) -> Option<&str> { - self.text + self.text.as_ref().map(|t| t.as_ref()) } /// Constructs a `SymbolRef` with unknown text. @@ -28,14 +28,17 @@ impl<'a> SymbolRef<'a> { } /// Constructs a `SymbolRef` with the specified text. - pub fn with_text(text: &str) -> SymbolRef { - SymbolRef { text: Some(text) } + pub fn with_text(text: impl Into<Cow<'a, str>>) -> SymbolRef<'a> { + SymbolRef { + text: Some(text.into()), + } } pub fn to_owned(self) -> Symbol { - match self.text() { + match self.text { None => Symbol::unknown_text(), - Some(text) => Symbol::owned(text), + Some(Cow::Borrowed(text)) => Symbol::owned(text), + Some(Cow::Owned(text)) => Symbol::owned(text), } } } @@ -60,14 +63,14 @@ pub trait AsSymbolRef { impl<'a, A: AsRef<str> + 'a> AsSymbolRef for A { fn as_symbol_ref(&self) -> SymbolRef { SymbolRef { - text: Some(self.as_ref()), + text: Some(Cow::Borrowed(self.as_ref())), } } } impl<'a> Hash for SymbolRef<'a> { fn hash<H: Hasher>(&self, state: &mut H) { - match self.text { + match self.text() { None => 0.hash(state), Some(text) => text.hash(state), } @@ -76,18 +79,33 @@ impl<'a> Hash for SymbolRef<'a> { impl<'a> From<&'a str> for SymbolRef<'a> { fn from(text: &'a str) -> Self { - Self { text: Some(text) } + Self { + text: Some(Cow::Borrowed(text)), + } } } -impl<'a> From<&'a Symbol> for SymbolRef<'a> { - fn from(symbol: &'a Symbol) -> Self { +impl<'a> From<String> for SymbolRef<'a> { + fn from(text: String) -> Self { Self { - text: symbol.text(), + text: Some(Cow::Owned(text)), } } } +impl<'a> From<Cow<'a, str>> for SymbolRef<'a> { + fn from(value: Cow<'a, str>) -> Self { + Self { 
text: Some(value) } + } +} + +impl<'a> From<&'a Symbol> for SymbolRef<'a> { + fn from(symbol: &'a Symbol) -> Self { + let text = symbol.text().map(Cow::Borrowed); + Self { text } + } +} + // Note that this method panics if the SymbolRef has unknown text! This is unfortunate but is required // in order to allow a HashMap to do lookups with a &str instead of a &SymbolRef impl<'a> Borrow<str> for SymbolRef<'a> { diff --git a/src/text/raw_text_writer.rs b/src/text/raw_text_writer.rs index b0e75717..68a043a0 100644 --- a/src/text/raw_text_writer.rs +++ b/src/text/raw_text_writer.rs @@ -320,12 +320,13 @@ impl RawTextWriter { match token.as_raw_symbol_token_ref() { RawSymbolTokenRef::SymbolId(sid) => write!(output, "${sid}")?, RawSymbolTokenRef::Text(text) - if Self::token_is_keyword(text) || Self::token_resembles_symbol_id(text) => + if Self::token_is_keyword(text.as_ref()) + || Self::token_resembles_symbol_id(text.as_ref()) => { // Write the symbol text in single quotes write!(output, "'{text}'")?; } - RawSymbolTokenRef::Text(text) if Self::token_is_identifier(text) => { + RawSymbolTokenRef::Text(text) if Self::token_is_identifier(text.as_ref()) => { // Write the symbol text without quotes write!(output, "{text}")? 
} diff --git a/src/text/text_formatter.rs b/src/text/text_formatter.rs index 828e9fb5..404d556d 100644 --- a/src/text/text_formatter.rs +++ b/src/text/text_formatter.rs @@ -229,12 +229,13 @@ impl<'a, W: std::fmt::Write> IonValueFormatter<'a, W> { match token.as_raw_symbol_token_ref() { RawSymbolTokenRef::SymbolId(sid) => write!(self.output, "${sid}")?, RawSymbolTokenRef::Text(text) - if Self::token_is_keyword(text) || Self::token_resembles_symbol_id(text) => + if Self::token_is_keyword(text.as_ref()) + || Self::token_resembles_symbol_id(text.as_ref()) => { // Write the symbol text in single quotes write!(self.output, "'{text}'")?; } - RawSymbolTokenRef::Text(text) if Self::token_is_identifier(text) => { + RawSymbolTokenRef::Text(text) if Self::token_is_identifier(text.as_ref()) => { // Write the symbol text without quotes write!(self.output, "{text}")? } diff --git a/src/text/text_writer.rs b/src/text/text_writer.rs index 7fc140d0..c4829974 100644 --- a/src/text/text_writer.rs +++ b/src/text/text_writer.rs @@ -123,7 +123,7 @@ impl IonWriter for TextWriter { RawSymbolTokenRef::SymbolId(symbol_id) => { // Get the text associated with this symbol ID match self.symbol_table.text_for(symbol_id) { - Some(text) => RawSymbolTokenRef::Text(text), + Some(text) => RawSymbolTokenRef::Text(text.into()), None => RawSymbolTokenRef::SymbolId(symbol_id), } } @@ -138,7 +138,7 @@ impl IonWriter for TextWriter { RawSymbolTokenRef::SymbolId(symbol_id) => { // Get the text associated with this symbol ID match self.symbol_table.text_for(symbol_id) { - Some(text) => RawSymbolTokenRef::Text(text), + Some(text) => RawSymbolTokenRef::Text(text.into()), None => RawSymbolTokenRef::SymbolId(symbol_id), } } @@ -152,7 +152,7 @@ impl IonWriter for TextWriter { RawSymbolTokenRef::SymbolId(symbol_id) => { // Get the text associated with this symbol ID match self.symbol_table.text_for(symbol_id) { - Some(text) => RawSymbolTokenRef::Text(text), + Some(text) => RawSymbolTokenRef::Text(text.into()), 
None => RawSymbolTokenRef::SymbolId(symbol_id), } }