From dc8579dd7ddeec8adf135a0e58486c088fcfd128 Mon Sep 17 00:00:00 2001
From: Zack Slayton <zack.slayton@gmail.com>
Date: Tue, 22 Aug 2023 20:30:59 -0400
Subject: [PATCH] Adds `LazyRawTextReader` support for reading symbols (#616)

---
 src/binary/binary_writer.rs    |   6 +-
 src/lazy/text/buffer.rs        | 191 ++++++++++++++++++--
 src/lazy/text/encoded_value.rs |   1 +
 src/lazy/text/matched.rs       | 315 ++++++++++++++++++++-------------
 src/lazy/text/raw/reader.rs    |  68 ++++++-
 src/lazy/text/value.rs         |   1 +
 src/lazy/value.rs              |   6 +-
 src/raw_symbol_token_ref.rs    |  16 +-
 src/symbol_ref.rs              |  46 +++--
 src/text/raw_text_writer.rs    |   5 +-
 src/text/text_formatter.rs     |   5 +-
 src/text/text_writer.rs        |   6 +-
 12 files changed, 483 insertions(+), 183 deletions(-)
diff --git a/src/binary/binary_writer.rs b/src/binary/binary_writer.rs
index 305604b2..186f845b 100644
--- a/src/binary/binary_writer.rs
+++ b/src/binary/binary_writer.rs
@@ -128,7 +128,7 @@ impl<W: Write> IonWriter for BinaryWriter<W> {
                         panic!("Cannot set symbol ID ${symbol_id} as annotation. It is undefined.");
                     }
                 }
-                RawSymbolTokenRef::Text(text) => self.get_or_create_symbol_id(text),
+                RawSymbolTokenRef::Text(text) => self.get_or_create_symbol_id(text.as_ref()),
             };
             self.raw_writer.add_annotation(symbol_id);
         }
@@ -145,7 +145,7 @@ impl<W: Write> IonWriter for BinaryWriter<W> {
                     ));
                 }
             }
-            RawSymbolTokenRef::Text(text) => self.get_or_create_symbol_id(text),
+            RawSymbolTokenRef::Text(text) => self.get_or_create_symbol_id(text.as_ref()),
         };
         self.raw_writer.write_symbol(symbol_id)
     }
@@ -159,7 +159,7 @@ impl<W: Write> IonWriter for BinaryWriter<W> {
                     panic!("Cannot set symbol ID ${symbol_id} as field name. It is undefined.");
                 }
             }
-            RawSymbolTokenRef::Text(text) => self.get_or_create_symbol_id(text),
+            RawSymbolTokenRef::Text(text) => self.get_or_create_symbol_id(text.as_ref()),
         };
         self.raw_writer.set_field_name(text);
     }
diff --git a/src/lazy/text/buffer.rs b/src/lazy/text/buffer.rs
index ce6f0af7..83d2975d 100644
--- a/src/lazy/text/buffer.rs
+++ b/src/lazy/text/buffer.rs
@@ -5,8 +5,8 @@ use std::slice::Iter;
 
 use nom::branch::alt;
 use nom::bytes::streaming::{is_a, is_not, tag, take_until, take_while1};
-use nom::character::streaming::{char, digit1, one_of};
-use nom::combinator::{fail, map, opt, peek, recognize, success, value};
+use nom::character::streaming::{char, digit1, one_of, satisfy};
+use nom::combinator::{fail, map, not, opt, peek, recognize, success, value};
 use nom::error::{ErrorKind, ParseError};
 use nom::multi::many0_count;
 use nom::sequence::{delimited, pair, preceded, separated_pair, terminated, tuple};
@@ -16,9 +16,9 @@ use crate::lazy::encoding::TextEncoding;
 use crate::lazy::raw_stream_item::RawStreamItem;
 use crate::lazy::text::encoded_value::EncodedTextValue;
 use crate::lazy::text::matched::{
-    MatchedFloat, MatchedInt, MatchedShortString, MatchedString, MatchedValue,
+    MatchedFloat, MatchedInt, MatchedShortString, MatchedString, MatchedSymbol, MatchedValue,
 };
-use crate::lazy::text::parse_result::IonParseError;
+use crate::lazy::text::parse_result::{InvalidInputError, IonParseError};
 use crate::lazy::text::parse_result::{IonMatchResult, IonParseResult};
 use crate::lazy::text::value::LazyRawTextValue;
 use crate::result::DecodingError;
@@ -275,6 +275,16 @@ impl<'data> TextBufferView<'data> {
                     )
                 },
             ),
+            map(
+                match_and_length(Self::match_symbol),
+                |(matched_symbol, length)| {
+                    EncodedTextValue::new(
+                        MatchedValue::Symbol(matched_symbol),
+                        self.offset(),
+                        length,
+                    )
+                },
+            ),
             // TODO: The other Ion types
         ))
         .map(|encoded_value| LazyRawTextValue {
@@ -463,6 +473,7 @@ impl<'data> TextBufferView<'data> {
             Self::match_float_numeric_value,
         ))(self)
     }
+
     /// Matches special IEEE-754 values, including +/- infinity and NaN.
     fn match_float_special_value(self) -> IonParseResult<'data, MatchedFloat> {
         alt((
@@ -577,6 +588,109 @@ impl<'data> TextBufferView<'data> {
     /// Returns a matched buffer and a boolean indicating whether any escaped characters were
     /// found in the short string.
     fn match_short_string_body(self) -> IonParseResult<'data, (Self, bool)> {
+        Self::match_text_until_unescaped(self, b'\"')
+    }
+
+    fn match_long_string(self) -> IonParseResult<'data, MatchedString> {
+        // TODO: implement long string matching
+        //       The `fail` parser is a nom builtin that never matches.
+        fail(self)
+    }
+
+    /// Matches a symbol ID (`$28`), an identifier (`foo`), or a quoted symbol (`'foo'`).
+    fn match_symbol(self) -> IonParseResult<'data, MatchedSymbol> {
+        // TODO: operators
+        alt((
+            Self::match_symbol_id,
+            Self::match_identifier,
+            Self::match_quoted_symbol,
+        ))(self)
+    }
+
+    /// Matches a symbol ID (`$28`).
+    fn match_symbol_id(self) -> IonParseResult<'data, MatchedSymbol> {
+        recognize(terminated(
+            // Discard a `$` and parse an integer representing the symbol ID.
+            // Note that symbol ID integers:
+            //   * CANNOT have underscores in them. For example: `$1_0` is considered an identifier.
+            //   * CAN have leading zeros. There's precedent for this in ion-java.
+            preceded(tag("$"), digit1),
+            // Peek at the next character to make sure it's unrelated to the symbol ID.
+            // The spec does not offer a formal definition of what ends a symbol ID.
+            // This checks for either a stop_character (which performs its own `peek()`)
+            // or a colon (":"), which could be a field delimiter (":") or the beginning of
+            // an annotation delimiter ('::').
+            alt((
+                // Each of the parsers passed to `alt` must have the same return type. `stop_character`
+                // returns a char instead of a &str, so we use `recognize()` to get a &str instead.
+                recognize(Self::peek_stop_character),
+                peek(tag(":")), // Field delimiter (":") or annotation delimiter ("::")
+            )),
+        ))
+        .map(|_matched| MatchedSymbol::SymbolId)
+        .parse(self)
+    }
+
+    /// Matches an identifier (`foo`).
+    fn match_identifier(self) -> IonParseResult<'data, MatchedSymbol> {
+        let (remaining, identifier_text) = recognize(terminated(
+            pair(
+                Self::identifier_initial_character,
+                Self::identifier_trailing_characters,
+            ),
+            not(Self::identifier_trailing_character),
+        ))(self)?;
+        // Ion defines a number of keywords that are syntactically indistinguishable from
+        // identifiers. Keywords take precedence; we must ensure that any identifier we find
+        // is not actually a keyword.
+        const KEYWORDS: &[&str] = &["true", "false", "nan", "null"];
+        // In many situations, this check will not be necessary. Another type's parser will
+        // recognize the keyword as its own. (For example, `parse_boolean` would match the input
+        // text `false`.) However, because symbols can appear in annotations and the check for
+        // annotations precedes the parsing for all other types, we need this extra verification.
+        if KEYWORDS
+            .iter()
+            .any(|k| k.as_bytes() == identifier_text.bytes())
+        {
+            // Finding a keyword is not a fatal error, it just means that this parser doesn't match.
+            return Err(nom::Err::Error(IonParseError::Invalid(
+                InvalidInputError::new(self),
+            )));
+        }
+        Ok((remaining, MatchedSymbol::Identifier))
+    }
+
+    /// Matches any character that can appear at the start of an identifier.
+    fn identifier_initial_character(self) -> IonParseResult<'data, Self> {
+        recognize(alt((one_of("$_"), satisfy(|c| c.is_ascii_alphabetic()))))(self)
+    }
+
+    /// Matches any character that is legal in an identifier, though not necessarily at the beginning.
+    fn identifier_trailing_character(self) -> IonParseResult<'data, Self> {
+        recognize(alt((one_of("$_"), satisfy(|c| c.is_ascii_alphanumeric()))))(self)
+    }
+
+    /// Matches characters that are legal in an identifier, though not necessarily at the beginning.
+    fn identifier_trailing_characters(self) -> IonParseResult<'data, Self> {
+        recognize(many0_count(Self::identifier_trailing_character))(self)
+    }
+
+    /// Matches a quoted symbol (`'foo'`).
+    fn match_quoted_symbol(self) -> IonParseResult<'data, MatchedSymbol> {
+        delimited(char('\''), Self::match_quoted_symbol_body, char('\''))
+            .map(|(_matched, contains_escaped_chars)| MatchedSymbol::Quoted(contains_escaped_chars))
+            .parse(self)
+    }
+
+    /// Returns a matched buffer and a boolean indicating whether any escaped characters were
+    /// found in the quoted symbol's body (the text before the next unescaped `'`).
+    fn match_quoted_symbol_body(self) -> IonParseResult<'data, (Self, bool)> {
+        Self::match_text_until_unescaped(self, b'\'')
+    }
+
+    /// A helper method for matching bytes until the specified delimiter. Ignores any byte
+    /// (including the delimiter) that is prefaced by the escape character `\`.
+    fn match_text_until_unescaped(self, delimiter: u8) -> IonParseResult<'data, (Self, bool)> {
         let mut is_escaped = false;
         let mut contains_escaped_chars = false;
         for (index, byte) in self.bytes().iter().enumerate() {
@@ -590,7 +704,7 @@ impl<'data> TextBufferView<'data> {
                 contains_escaped_chars = true;
                 continue;
             }
-            if *byte == b'\"' {
+            if *byte == delimiter {
                 let matched = self.slice(0, index);
                 let remaining = self.slice_to_end(index);
                 return Ok((remaining, (matched, contains_escaped_chars)));
@@ -598,12 +712,6 @@ impl<'data> TextBufferView<'data> {
         }
         Err(nom::Err::Incomplete(Needed::Unknown))
     }
-
-    fn match_long_string(self) -> IonParseResult<'data, MatchedString> {
-        // TODO: implement long string matching
-        //       The `fail` parser is a nom builtin that never matches.
-        fail(self)
-    }
 }
 
 // === nom trait implementations ===
@@ -839,13 +947,17 @@ mod tests {
             P: Parser<TextBufferView<'data>, O, IonParseError<'data>>,
         {
             let result = self.try_match(parser);
-            // We expect this to fail for one reason or another
-            assert!(
-                result.is_err(),
-                "Expected a parse failure for input: {:?}\nResult: {:?}",
-                self.input,
-                result
-            );
+            // We expect that only part of the input will match or that the entire
+            // input will be rejected outright.
+            if let Ok((_remaining, match_length)) = result {
+                assert_ne!(
+                    match_length,
+                    self.input.len() - 1,
+                    "parser unexpectedly matched the complete input: '{:?}\nResult: {:?}",
+                    self.input,
+                    result
+                );
+            }
         }
     }
 
@@ -1038,13 +1150,54 @@ mod tests {
             r#"
             hello"
             "#,
-            // Missing a trailing quote
+            // Missing a closing quote
             r#"
             "hello
             "#,
+            // Closing quote is escaped
+            r#"
+            "hello\"
+            "#,
         ];
         for input in bad_inputs {
             mismatch_string(input);
         }
     }
+
+    #[test]
+    fn test_match_symbol() {
+        fn match_symbol(input: &str) {
+            MatchTest::new(input).expect_match(match_length(TextBufferView::match_symbol));
+        }
+        fn mismatch_symbol(input: &str) {
+            MatchTest::new(input).expect_mismatch(match_length(TextBufferView::match_symbol));
+        }
+
+        // Each input is handed to the symbol matcher exactly as written: a quoted symbol, a
+        // symbol ID, or an identifier. None are padded with whitespace, so no trimming is done.
+        let good_inputs = &[
+            "'hello'",
+            "'😀😀😀'",
+            "'this has an escaped quote \\' right in the middle'",
+            "$308",
+            "$0",
+            "foo",
+            "name",
+            "$bar",
+            "_baz_quux",
+        ];
+        for input in good_inputs {
+            match_symbol(input);
+        }
+
+        let bad_inputs = &[
+            "'hello",    // No closing quote
+            "'hello\\'", // Closing quote is escaped
+            "$-8",       // Negative SID
+            "nan",       // Identifier that is also a keyword
+        ];
+        for input in bad_inputs {
+            mismatch_symbol(input);
+        }
+    }
 }
diff --git a/src/lazy/text/encoded_value.rs b/src/lazy/text/encoded_value.rs
index d994a2bc..873e8f2f 100644
--- a/src/lazy/text/encoded_value.rs
+++ b/src/lazy/text/encoded_value.rs
@@ -116,6 +116,7 @@ impl EncodedTextValue {
             MatchedValue::Int(_) => IonType::Int,
             MatchedValue::Float(_) => IonType::Float,
             MatchedValue::String(_) => IonType::String,
+            MatchedValue::Symbol(_) => IonType::Symbol,
         }
     }
 
diff --git a/src/lazy/text/matched.rs b/src/lazy/text/matched.rs
index 8725b9c8..233c5c11 100644
--- a/src/lazy/text/matched.rs
+++ b/src/lazy/text/matched.rs
@@ -19,7 +19,9 @@
 //! use the previously recorded information to minimize the amount of information that needs to be
 //! re-discovered.
 
+use std::borrow::Cow;
 use std::num::IntErrorKind;
+use std::str::FromStr;
 
 use nom::character::is_hex_digit;
 use num_bigint::BigInt;
@@ -31,7 +33,7 @@ use crate::lazy::text::as_utf8::AsUtf8;
 use crate::lazy::text::buffer::TextBufferView;
 use crate::lazy::text::parse_result::InvalidInputError;
 use crate::result::{DecodingError, IonFailure};
-use crate::{Int, IonError, IonResult, IonType};
+use crate::{Int, IonError, IonResult, IonType, RawSymbolTokenRef};
 
 /// A partially parsed Ion value.
 #[derive(Copy, Clone, Debug, PartialEq)]
@@ -42,6 +44,7 @@ pub(crate) enum MatchedValue {
     Int(MatchedInt),
     Float(MatchedFloat),
     String(MatchedString),
+    Symbol(MatchedSymbol),
     // TODO: ...the other types
 }
 
@@ -133,8 +136,6 @@ impl MatchedFloat {
     const STACK_ALLOC_BUFFER_CAPACITY: usize = 32;
 
     pub fn read(&self, matched_input: TextBufferView) -> IonResult<f64> {
-        use std::str::FromStr;
-
         match self {
             MatchedFloat::PositiveInfinity => return Ok(f64::INFINITY),
             MatchedFloat::NegativeInfinity => return Ok(f64::NEG_INFINITY),
@@ -212,140 +213,139 @@ impl MatchedString {
         // that replaces the escaped characters with their corresponding bytes.
         let mut sanitized = Vec::with_capacity(matched_input.len());
 
-        Self::escape_short_string(body, &mut sanitized)?;
+        escape_text(body, &mut sanitized)?;
         let text = String::from_utf8(sanitized).unwrap();
         Ok(StrRef::from(text.to_string()))
     }
+}
 
-    fn escape_short_string(
-        matched_input: TextBufferView,
-        sanitized: &mut Vec<u8>,
-    ) -> IonResult<()> {
-        let mut remaining = matched_input;
-        while !remaining.is_empty() {
-            let next_escape = remaining.bytes().iter().position(|byte| *byte == b'\\');
-            remaining = if let Some(escape_offset) = next_escape {
-                // Everything up to the '\' is already clean. Write that slice to 'sanitized'.
-                let already_clean = remaining.slice(0, escape_offset);
-                sanitized.extend_from_slice(already_clean.bytes());
-                // Everything starting from the '\' needs to be evaluated.
-                let contains_escapes = remaining.slice_to_end(escape_offset);
-                Self::write_escaped(contains_escapes, sanitized)?
-            } else {
-                sanitized.extend_from_slice(remaining.bytes());
-                // 'remaining' is now empty
-                remaining.slice_to_end(remaining.len())
-            };
-        }
-
-        Ok(())
+fn escape_text(matched_input: TextBufferView, sanitized: &mut Vec<u8>) -> IonResult<()> {
+    let mut remaining = matched_input;
+    while !remaining.is_empty() {
+        let next_escape = remaining.bytes().iter().position(|byte| *byte == b'\\');
+        remaining = if let Some(escape_offset) = next_escape {
+            // Everything up to the '\' is already clean. Write that slice to 'sanitized'.
+            let already_clean = remaining.slice(0, escape_offset);
+            sanitized.extend_from_slice(already_clean.bytes());
+            // Everything starting from the '\' needs to be evaluated.
+            let contains_escapes = remaining.slice_to_end(escape_offset);
+            write_escaped(contains_escapes, sanitized)?
+        } else {
+            sanitized.extend_from_slice(remaining.bytes());
+            // 'remaining' is now empty
+            remaining.slice_to_end(remaining.len())
+        };
     }
 
-    fn write_escaped<'data>(
-        input: TextBufferView<'data>,
-        sanitized: &mut Vec<u8>,
-    ) -> IonResult<TextBufferView<'data>> {
-        // Note that by the time this method has been called, the parser has already confirmed that
-        // there is an appropriate closing delimiter. Thus, if any of the branches below run out of
-        // data, it means that it's a fatal error and not just an Incomplete.
-        debug_assert!(!input.is_empty());
-        debug_assert!(input.bytes()[0] == b'\\');
-        if input.len() == 1 {
-            return Err(IonError::Decoding(
-                DecodingError::new("found an escape ('\\') with no subsequent character")
-                    .with_position(input.offset()),
-            ));
-        }
-        let input_after_escape = input.slice_to_end(2); // After (e.g.) '\x'
-        let escape_id = input.bytes()[1];
-        let substitute = match escape_id {
-            b'n' => b'\n',
-            b'r' => b'\r',
-            b't' => b'\t',
-            b'\\' => b'\\',
-            b'/' => b'/',
-            b'"' => b'"',
-            b'\'' => b'\'',
-            b'?' => b'?',
-            b'0' => 0x00u8, // NUL
-            b'a' => 0x07u8, // alert BEL
-            b'b' => 0x08u8, // backspace
-            b'v' => 0x0Bu8, // vertical tab
-            b'f' => 0x0Cu8, // form feed
-            // If the byte following the '\' is a real newline (that is: 0x0A), we discard it.
-            b'\n' => return Ok(input_after_escape),
-            // These cases require more sophisticated parsing, not just a 1-to-1 mapping of bytes
-            b'x' => return Self::hex_digits_code_point(2, input_after_escape, sanitized),
-            b'u' => return Self::hex_digits_code_point(4, input_after_escape, sanitized),
-            b'U' => return Self::hex_digits_code_point(8, input_after_escape, sanitized),
-            _ => {
-                return Err(IonError::Decoding(
-                    DecodingError::new(format!("invalid escape sequence '\\{}", escape_id))
-                        .with_position(input.offset()),
-                ))
-            }
-        };
+    Ok(())
+}
 
-        sanitized.push(substitute);
-        Ok(input_after_escape)
+fn write_escaped<'data>(
+    input: TextBufferView<'data>,
+    sanitized: &mut Vec<u8>,
+) -> IonResult<TextBufferView<'data>> {
+    // Note that by the time this method has been called, the parser has already confirmed that
+    // there is an appropriate closing delimiter. Thus, if any of the branches below run out of
+    // data, it means that it's a fatal error and not just an Incomplete.
+    debug_assert!(!input.is_empty());
+    debug_assert!(input.bytes()[0] == b'\\');
+    if input.len() == 1 {
+        return Err(IonError::Decoding(
+            DecodingError::new("found an escape ('\\') with no subsequent character")
+                .with_position(input.offset()),
+        ));
     }
-
-    fn hex_digits_code_point<'data>(
-        num_digits: usize,
-        input: TextBufferView<'data>,
-        sanitized: &mut Vec<u8>,
-    ) -> IonResult<TextBufferView<'data>> {
-        if input.len() < num_digits {
+    let input_after_escape = input.slice_to_end(2); // After (e.g.) '\x'
+    let escape_id = input.bytes()[1];
+    let substitute = match escape_id {
+        b'n' => b'\n',
+        b'r' => b'\r',
+        b't' => b'\t',
+        b'\\' => b'\\',
+        b'/' => b'/',
+        b'"' => b'"',
+        b'\'' => b'\'',
+        b'?' => b'?',
+        b'0' => 0x00u8, // NUL
+        b'a' => 0x07u8, // alert BEL
+        b'b' => 0x08u8, // backspace
+        b'v' => 0x0Bu8, // vertical tab
+        b'f' => 0x0Cu8, // form feed
+        // If the byte following the '\' is a real newline (that is: 0x0A), we discard it.
+        b'\n' => return Ok(input_after_escape),
+        // These cases require more sophisticated parsing, not just a 1-to-1 mapping of bytes
+        b'x' => return hex_digits_code_point(2, input_after_escape, sanitized),
+        b'u' => return hex_digits_code_point(4, input_after_escape, sanitized),
+        b'U' => return hex_digits_code_point(8, input_after_escape, sanitized),
+        _ => {
             return Err(IonError::Decoding(
-                DecodingError::new(format!(
-                    "found a {}-hex-digit escape sequence with only {} digits",
-                    num_digits,
-                    input.len()
-                ))
-                .with_position(input.offset()),
-            ));
+                DecodingError::new(format!("invalid escape sequence '\\{}", escape_id))
+                    .with_position(input.offset()),
+            ))
         }
+    };
 
-        let hex_digit_bytes = &input.bytes()[..num_digits];
-
-        let all_are_hex_digits = hex_digit_bytes
-            .iter()
-            .take(num_digits)
-            .copied()
-            .all(is_hex_digit);
-        if !all_are_hex_digits {
-            return Err(IonError::Decoding(
-                DecodingError::new(format!(
-                    "found a {}-hex-digit escape sequence that contained an invalid hex digit",
-                    num_digits,
-                ))
-                .with_position(input.offset()),
-            ));
-        }
-        // We just confirmed all of the digits are ASCII hex digits, so these steps cannot fail.
-        let hex_digits = std::str::from_utf8(hex_digit_bytes).unwrap();
-        let code_point = u32::from_str_radix(hex_digits, 16).unwrap();
-
-        // Check to see if this is a high surrogate; if it is, our code point isn't complete. Another
-        // unicode escape representing the low surrogate has to be next in the input to complete it.
-        // See the docs for this helper function for details. (Note: this will only ever be true for
-        // 4- and 8-digit escape sequences. `\x` escapes don't have enough digits to represent a
-        // high surrogate.)
-        if code_point_is_a_high_surrogate(code_point) {
-            todo!("support surrogate pairs")
-        }
+    sanitized.push(substitute);
+    Ok(input_after_escape)
+}
 
-        // A Rust `char` can represent any Unicode scalar value--a code point that is not part of a
-        // surrogate pair. If the value we found isn't a high surrogate, then it's a complete scalar
-        // value. We can safely convert it to a `char`.
-        let character = char::from_u32(code_point).unwrap();
-        let utf8_buffer: &mut [u8; 4] = &mut [0; 4];
-        let utf8_encoded = character.encode_utf8(utf8_buffer);
-        sanitized.extend_from_slice(utf8_encoded.as_bytes());
+/// Reads the next `num_digits` bytes from `input` as a hex code point and writes its UTF-8 bytes
+/// to `sanitized`. Returns a decoding error if the digits are missing, malformed, or not a `char`.
+fn hex_digits_code_point<'data>(
+    num_digits: usize,
+    input: TextBufferView<'data>,
+    sanitized: &mut Vec<u8>,
+) -> IonResult<TextBufferView<'data>> {
+    if input.len() < num_digits {
+        return Err(IonError::Decoding(
+            DecodingError::new(format!(
+                "found a {}-hex-digit escape sequence with only {} digits",
+                num_digits,
+                input.len()
+            ))
+            .with_position(input.offset()),
+        ));
+    }
 
-        // Skip beyond the digits we just processed
-        Ok(input.slice_to_end(num_digits))
+    let hex_digit_bytes = &input.bytes()[..num_digits];
+
+    let all_are_hex_digits = hex_digit_bytes
+        .iter()
+        .take(num_digits)
+        .copied()
+        .all(is_hex_digit);
+    if !all_are_hex_digits {
+        return Err(IonError::Decoding(
+            DecodingError::new(format!(
+                "found a {}-hex-digit escape sequence that contained an invalid hex digit",
+                num_digits,
+            ))
+            .with_position(input.offset()),
+        ));
     }
+    // We just confirmed all of the digits are ASCII hex digits, so these steps cannot fail.
+    let hex_digits = std::str::from_utf8(hex_digit_bytes).unwrap();
+    let code_point = u32::from_str_radix(hex_digits, 16).unwrap();
+
+    // Check to see if this is a high surrogate; if it is, our code point isn't complete. Another
+    // unicode escape representing the low surrogate has to be next in the input to complete it.
+    // See the docs for this helper function for details. (Note: this will only ever be true for
+    // 4- and 8-digit escape sequences. `\x` escapes don't have enough digits to represent a
+    // high surrogate.)
+    if code_point_is_a_high_surrogate(code_point) {
+        todo!("support surrogate pairs")
+    }
+
+    // Surrogates were dealt with above, but an 8-digit `\U` escape can still exceed 0x10FFFF,
+    // the largest Unicode scalar value. Report that as invalid input rather than panicking.
+    let character = char::from_u32(code_point).ok_or_else(|| {
+        let error = DecodingError::new("escape sequence is not a valid Unicode code point");
+        IonError::Decoding(error.with_position(input.offset()))
+    })?;
+    sanitized.extend_from_slice(character.encode_utf8(&mut [0; 4]).as_bytes());
+
+    // Skip beyond the digits we just processed
+    Ok(input.slice_to_end(num_digits))
 }
 
 /// Returns `true` if the provided code point is a utf-16 high surrogate.
@@ -373,3 +373,72 @@ impl MatchedString {
 fn code_point_is_a_high_surrogate(value: u32) -> bool {
     (0xD800..=0xDFFF).contains(&value)
 }
+
+#[derive(Copy, Clone, Debug, PartialEq)]
+pub(crate) enum MatchedSymbol {
+    /// A numeric symbol ID (e.g. `$21`)
+    SymbolId,
+    /// The symbol is an unquoted identifier (e.g. `foo`)
+    Identifier,
+    /// The symbol is delimited by single quotes. Holds a `bool` indicating whether the
+    /// matched input contained any escaped bytes.
+    Quoted(bool),
+    // TODO: Operators in S-Expressions
+}
+
+impl MatchedSymbol {
+    pub fn read<'data>(
+        &self,
+        matched_input: TextBufferView<'data>,
+    ) -> IonResult<RawSymbolTokenRef<'data>> {
+        match self {
+            MatchedSymbol::SymbolId => self.read_symbol_id(matched_input),
+            MatchedSymbol::Identifier => self.read_identifier(matched_input),
+            MatchedSymbol::Quoted(contains_escaped_chars) => {
+                self.read_quoted(matched_input, *contains_escaped_chars)
+            }
+        }
+    }
+
+    fn read_quoted<'data>(
+        &self,
+        matched_input: TextBufferView<'data>,
+        contains_escaped_chars: bool,
+    ) -> IonResult<RawSymbolTokenRef<'data>> {
+        // Take a slice of the input that ignores the first and last bytes, which are quotes.
+        let body = matched_input.slice(1, matched_input.len() - 2);
+        if !contains_escaped_chars {
+            // There are no escaped characters, so we can just validate the string in-place.
+            let text = body.as_text()?;
+            let str_ref = RawSymbolTokenRef::Text(text.into());
+            return Ok(str_ref);
+        }
+
+        // Otherwise, there are escaped characters. We need to build a new version of our symbol
+        // that replaces the escaped characters with their corresponding bytes.
+        let mut sanitized = Vec::with_capacity(matched_input.len());
+
+        escape_text(body, &mut sanitized)?;
+        let text = String::from_utf8(sanitized).unwrap();
+        Ok(RawSymbolTokenRef::Text(text.into()))
+    }
+    fn read_identifier<'data>(
+        &self,
+        matched_input: TextBufferView<'data>,
+    ) -> IonResult<RawSymbolTokenRef<'data>> {
+        matched_input
+            .as_text()
+            .map(|t| RawSymbolTokenRef::Text(Cow::Borrowed(t)))
+    }
+    fn read_symbol_id<'data>(
+        &self,
+        matched_input: TextBufferView<'data>,
+    ) -> IonResult<RawSymbolTokenRef<'data>> {
+        // Skip the `$`. Only digits follow, so parsing can only fail on `usize` overflow.
+        let text = matched_input.slice_to_end(1).as_text()?;
+        usize::from_str(text).map(RawSymbolTokenRef::SymbolId).map_err(|_| {
+            let error = DecodingError::new("symbol ID is too large to fit in a usize");
+            IonError::Decoding(error.with_position(matched_input.offset()))
+        })
+    }
+}
diff --git a/src/lazy/text/raw/reader.rs b/src/lazy/text/raw/reader.rs
index 52346b8f..d45a52ce 100644
--- a/src/lazy/text/raw/reader.rs
+++ b/src/lazy/text/raw/reader.rs
@@ -71,11 +71,13 @@ mod tests {
     use super::*;
     use crate::lazy::decoder::LazyRawValue;
     use crate::lazy::raw_value_ref::RawValueRef;
-    use crate::IonType;
+    use crate::{IonType, RawSymbolTokenRef};
 
     #[test]
     fn test_top_level() -> IonResult<()> {
-        let data = r#"
+        let mut data = String::new();
+        data.push_str(
+            r#"
         /*
             This test demonstrates lazily reading top-level values
             of various Ion types. The values are interspersed with
@@ -117,14 +119,30 @@ mod tests {
         "\x48ello, \x77orld!"              // \x 2-digit hex escape
         "\u0048ello, \u0077orld!"          // \u 4-digit hex escape
         "\U00000048ello, \U00000077orld!"  // \U 8-digit hex escape
-        
-        "#;
 
-        // Make a mutable string so we can append some things that require Rust-level escapes
-        let mut data = String::from(data);
+        "#,
+        );
         // Escaped newlines are discarded
         data.push_str("\"Hello,\\\n world!\"");
 
+        data.push_str(
+            r#"
+        // Symbols
+        
+        'foo'
+        'Hello, world!'
+        '😎😎😎'
+        
+        firstName
+        date_of_birth
+        $variable
+        
+        $0
+        $10
+        $733
+        "#,
+        );
+
         fn expect_next<'data>(
             reader: &mut LazyRawTextReader<'data>,
             expected: RawValueRef<'data, TextEncoding>,
@@ -197,6 +215,44 @@ mod tests {
         expect_next(reader, RawValueRef::String("Hello, world!".into()));
         // "\"Hello,\\\n world!\" "
         expect_next(reader, RawValueRef::String("Hello, world!".into()));
+        // 'foo'
+        expect_next(
+            reader,
+            RawValueRef::Symbol(RawSymbolTokenRef::Text("foo".into())),
+        );
+        expect_next(
+            reader,
+            RawValueRef::Symbol(RawSymbolTokenRef::Text("Hello, world!".into())),
+        );
+        expect_next(
+            reader,
+            RawValueRef::Symbol(RawSymbolTokenRef::Text("😎😎😎".into())),
+        );
+        // firstName
+        expect_next(
+            reader,
+            RawValueRef::Symbol(RawSymbolTokenRef::Text("firstName".into())),
+        );
+        // date_of_birth
+        expect_next(
+            reader,
+            RawValueRef::Symbol(RawSymbolTokenRef::Text("date_of_birth".into())),
+        );
+        // $variable
+        expect_next(
+            reader,
+            RawValueRef::Symbol(RawSymbolTokenRef::Text("$variable".into())),
+        );
+        // $0
+        expect_next(reader, RawValueRef::Symbol(RawSymbolTokenRef::SymbolId(0)));
+        // $10
+        expect_next(reader, RawValueRef::Symbol(RawSymbolTokenRef::SymbolId(10)));
+        // $733
+        expect_next(
+            reader,
+            RawValueRef::Symbol(RawSymbolTokenRef::SymbolId(733)),
+        );
+
         Ok(())
     }
 }
diff --git a/src/lazy/text/value.rs b/src/lazy/text/value.rs
index 8f60346c..d0c4365c 100644
--- a/src/lazy/text/value.rs
+++ b/src/lazy/text/value.rs
@@ -53,6 +53,7 @@ impl<'data> LazyRawValue<'data, TextEncoding> for LazyRawTextValue<'data> {
             MatchedValue::Float(f) => RawValueRef::Float(f.read(matched_input)?),
             // ...decimal, timestamp...
             MatchedValue::String(s) => RawValueRef::String(s.read(matched_input)?),
+            MatchedValue::Symbol(s) => RawValueRef::Symbol(s.read(matched_input)?),
             // ...and the rest!
         };
         Ok(value_ref)
diff --git a/src/lazy/value.rs b/src/lazy/value.rs
index 8f09cdbf..f25caf7e 100644
--- a/src/lazy/value.rs
+++ b/src/lazy/value.rs
@@ -9,6 +9,7 @@ use crate::{
     Annotations, Element, IntoAnnotatedElement, IonError, IonResult, IonType, RawSymbolTokenRef,
     SymbolRef, SymbolTable, Value,
 };
+use std::borrow::Cow;
 
 /// A value in a binary Ion stream whose header has been parsed but whose body (i.e. its data) has
 /// not. A `LazyValue` is immutable; its data can be read any number of times.
@@ -184,7 +185,8 @@ impl<'top, 'data, D: LazyDecoder<'data>> LazyValue<'top, 'data, D> {
                             ))
                         })?
                         .into(),
-                    RawSymbolTokenRef::Text(text) => text.into(),
+                    RawSymbolTokenRef::Text(Cow::Borrowed(text)) => text.into(),
+                    RawSymbolTokenRef::Text(Cow::Owned(text)) => text.into(),
                 };
                 ValueRef::Symbol(symbol)
             }
@@ -333,7 +335,7 @@ where
                 )),
                 Some(symbol) => Some(Ok(symbol.into())),
             },
-            Ok(RawSymbolTokenRef::Text(text)) => Some(Ok(SymbolRef::with_text(text))),
+            Ok(RawSymbolTokenRef::Text(text)) => Some(Ok(text.into())),
             Err(e) => Some(Err(e)),
         }
     }
diff --git a/src/raw_symbol_token_ref.rs b/src/raw_symbol_token_ref.rs
index d4a00c4d..dddedc7c 100644
--- a/src/raw_symbol_token_ref.rs
+++ b/src/raw_symbol_token_ref.rs
@@ -1,11 +1,12 @@
 use crate::raw_symbol_token::RawSymbolToken;
 use crate::{Symbol, SymbolId};
+use std::borrow::Cow;
 
-/// Like RawSymbolToken, but the Text variant holds a borrowed reference instead of a String.
+/// Like RawSymbolToken, but the Text variant holds a `Cow<'a, str>` instead of a String.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum RawSymbolTokenRef<'a> {
     SymbolId(SymbolId),
-    Text(&'a str),
+    Text(Cow<'a, str>),
 }
 
 /// Implemented by types that can be viewed as a [RawSymbolTokenRef] without allocations.
@@ -15,10 +16,11 @@ pub trait AsRawSymbolTokenRef {
 
 impl<'a> AsRawSymbolTokenRef for RawSymbolTokenRef<'a> {
     fn as_raw_symbol_token_ref(&self) -> RawSymbolTokenRef {
         match self {
             RawSymbolTokenRef::SymbolId(sid) => RawSymbolTokenRef::SymbolId(*sid),
-            RawSymbolTokenRef::Text(text) => RawSymbolTokenRef::Text(text),
+            // Re-borrow the text instead of cloning so an owned `Cow` is never reallocated.
+            RawSymbolTokenRef::Text(text) => RawSymbolTokenRef::Text(Cow::Borrowed(text.as_ref())),
         }
     }
 }
 
@@ -30,20 +28,20 @@ impl AsRawSymbolTokenRef for SymbolId {
 
 impl AsRawSymbolTokenRef for String {
     fn as_raw_symbol_token_ref(&self) -> RawSymbolTokenRef {
-        RawSymbolTokenRef::Text(self.as_str())
+        RawSymbolTokenRef::Text(Cow::from(self.as_str()))
     }
 }
 
 impl AsRawSymbolTokenRef for &str {
     fn as_raw_symbol_token_ref(&self) -> RawSymbolTokenRef {
-        RawSymbolTokenRef::Text(self)
+        RawSymbolTokenRef::Text(Cow::from(*self))
     }
 }
 
 impl AsRawSymbolTokenRef for Symbol {
     fn as_raw_symbol_token_ref(&self) -> RawSymbolTokenRef {
         match self.text() {
-            Some(text) => RawSymbolTokenRef::Text(text),
+            Some(text) => RawSymbolTokenRef::Text(Cow::from(text)),
             None => RawSymbolTokenRef::SymbolId(0),
         }
     }
@@ -62,7 +60,7 @@ impl AsRawSymbolTokenRef for RawSymbolToken {
     fn as_raw_symbol_token_ref(&self) -> RawSymbolTokenRef {
         match self {
             RawSymbolToken::SymbolId(sid) => RawSymbolTokenRef::SymbolId(*sid),
-            RawSymbolToken::Text(text) => RawSymbolTokenRef::Text(text.as_str()),
+            RawSymbolToken::Text(text) => RawSymbolTokenRef::Text(Cow::from(text.as_str())),
         }
     }
 }
diff --git a/src/symbol_ref.rs b/src/symbol_ref.rs
index 9cd42cac..815c75fe 100644
--- a/src/symbol_ref.rs
+++ b/src/symbol_ref.rs
@@ -1,5 +1,5 @@
 use crate::Symbol;
-use std::borrow::Borrow;
+use std::borrow::{Borrow, Cow};
 use std::fmt::{Debug, Formatter};
 use std::hash::{Hash, Hasher};
 
@@ -7,19 +7,19 @@ use std::hash::{Hash, Hasher};
 /// static lifetime), a `SymbolRef` may have known or undefined text (i.e. `$0`).
 #[derive(PartialEq, Eq, PartialOrd, Ord, Clone)]
 pub struct SymbolRef<'a> {
-    text: Option<&'a str>,
+    text: Option<Cow<'a, str>>,
 }
 
 impl<'a> Debug for SymbolRef<'a> {
     fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.text.unwrap_or("$0"))
+        write!(f, "{}", self.text().unwrap_or("$0"))
     }
 }
 
 impl<'a> SymbolRef<'a> {
     /// If this symbol has known text, returns `Some(&str)`. Otherwise, returns `None`.
     pub fn text(&self) -> Option<&str> {
-        self.text
+        self.text.as_ref().map(|t| t.as_ref())
     }
 
     /// Constructs a `SymbolRef` with unknown text.
@@ -28,14 +28,17 @@ impl<'a> SymbolRef<'a> {
     }
 
     /// Constructs a `SymbolRef` with the specified text.
-    pub fn with_text(text: &str) -> SymbolRef {
-        SymbolRef { text: Some(text) }
+    pub fn with_text(text: impl Into<Cow<'a, str>>) -> SymbolRef<'a> {
+        SymbolRef {
+            text: Some(text.into()),
+        }
     }
 
     pub fn to_owned(self) -> Symbol {
-        match self.text() {
+        match self.text {
             None => Symbol::unknown_text(),
-            Some(text) => Symbol::owned(text),
+            Some(Cow::Borrowed(text)) => Symbol::owned(text),
+            Some(Cow::Owned(text)) => Symbol::owned(text),
         }
     }
 }
@@ -60,14 +63,14 @@ pub trait AsSymbolRef {
 impl<'a, A: AsRef<str> + 'a> AsSymbolRef for A {
     fn as_symbol_ref(&self) -> SymbolRef {
         SymbolRef {
-            text: Some(self.as_ref()),
+            text: Some(Cow::Borrowed(self.as_ref())),
         }
     }
 }
 
 impl<'a> Hash for SymbolRef<'a> {
     fn hash<H: Hasher>(&self, state: &mut H) {
-        match self.text {
+        match self.text() {
             None => 0.hash(state),
             Some(text) => text.hash(state),
         }
@@ -76,18 +79,33 @@ impl<'a> Hash for SymbolRef<'a> {
 
 impl<'a> From<&'a str> for SymbolRef<'a> {
     fn from(text: &'a str) -> Self {
-        Self { text: Some(text) }
+        Self {
+            text: Some(Cow::Borrowed(text)),
+        }
     }
 }
 
-impl<'a> From<&'a Symbol> for SymbolRef<'a> {
-    fn from(symbol: &'a Symbol) -> Self {
+impl<'a> From<String> for SymbolRef<'a> {
+    fn from(text: String) -> Self {
         Self {
-            text: symbol.text(),
+            text: Some(Cow::Owned(text)),
         }
     }
 }
 
+impl<'a> From<Cow<'a, str>> for SymbolRef<'a> {
+    fn from(text: Cow<'a, str>) -> Self {
+        Self::with_text(text)
+    }
+}
+
+impl<'a> From<&'a Symbol> for SymbolRef<'a> {
+    fn from(symbol: &'a Symbol) -> Self {
+        // Unknown text stays unknown; known text is borrowed from `symbol`, not copied.
+        symbol.text().map_or(Self { text: None }, Self::with_text)
+    }
+}
+
 // Note that this method panics if the SymbolRef has unknown text! This is unfortunate but is required
 // in order to allow a HashMap<SymbolRef, _> to do lookups with a &str instead of a &SymbolRef
 impl<'a> Borrow<str> for SymbolRef<'a> {
diff --git a/src/text/raw_text_writer.rs b/src/text/raw_text_writer.rs
index b0e75717..68a043a0 100644
--- a/src/text/raw_text_writer.rs
+++ b/src/text/raw_text_writer.rs
@@ -320,12 +320,13 @@ impl<W: Write> RawTextWriter<W> {
         match token.as_raw_symbol_token_ref() {
             RawSymbolTokenRef::SymbolId(sid) => write!(output, "${sid}")?,
             RawSymbolTokenRef::Text(text)
-                if Self::token_is_keyword(text) || Self::token_resembles_symbol_id(text) =>
+                if Self::token_is_keyword(text.as_ref())
+                    || Self::token_resembles_symbol_id(text.as_ref()) =>
             {
                 // Write the symbol text in single quotes
                 write!(output, "'{text}'")?;
             }
-            RawSymbolTokenRef::Text(text) if Self::token_is_identifier(text) => {
+            RawSymbolTokenRef::Text(text) if Self::token_is_identifier(text.as_ref()) => {
                 // Write the symbol text without quotes
                 write!(output, "{text}")?
             }
diff --git a/src/text/text_formatter.rs b/src/text/text_formatter.rs
index 828e9fb5..404d556d 100644
--- a/src/text/text_formatter.rs
+++ b/src/text/text_formatter.rs
@@ -229,12 +229,13 @@ impl<'a, W: std::fmt::Write> IonValueFormatter<'a, W> {
         match token.as_raw_symbol_token_ref() {
             RawSymbolTokenRef::SymbolId(sid) => write!(self.output, "${sid}")?,
             RawSymbolTokenRef::Text(text)
-                if Self::token_is_keyword(text) || Self::token_resembles_symbol_id(text) =>
+                if Self::token_is_keyword(text.as_ref())
+                    || Self::token_resembles_symbol_id(text.as_ref()) =>
             {
                 // Write the symbol text in single quotes
                 write!(self.output, "'{text}'")?;
             }
-            RawSymbolTokenRef::Text(text) if Self::token_is_identifier(text) => {
+            RawSymbolTokenRef::Text(text) if Self::token_is_identifier(text.as_ref()) => {
                 // Write the symbol text without quotes
                 write!(self.output, "{text}")?
             }
diff --git a/src/text/text_writer.rs b/src/text/text_writer.rs
index 7fc140d0..c4829974 100644
--- a/src/text/text_writer.rs
+++ b/src/text/text_writer.rs
@@ -123,7 +123,7 @@ impl<W: Write> IonWriter for TextWriter<W> {
                 RawSymbolTokenRef::SymbolId(symbol_id) => {
                     // Get the text associated with this symbol ID
                     match self.symbol_table.text_for(symbol_id) {
-                        Some(text) => RawSymbolTokenRef::Text(text),
+                        Some(text) => RawSymbolTokenRef::Text(text.into()),
                         None => RawSymbolTokenRef::SymbolId(symbol_id),
                     }
                 }
@@ -138,7 +138,7 @@ impl<W: Write> IonWriter for TextWriter<W> {
             RawSymbolTokenRef::SymbolId(symbol_id) => {
                 // Get the text associated with this symbol ID
                 match self.symbol_table.text_for(symbol_id) {
-                    Some(text) => RawSymbolTokenRef::Text(text),
+                    Some(text) => RawSymbolTokenRef::Text(text.into()),
                     None => RawSymbolTokenRef::SymbolId(symbol_id),
                 }
             }
@@ -152,7 +152,7 @@ impl<W: Write> IonWriter for TextWriter<W> {
             RawSymbolTokenRef::SymbolId(symbol_id) => {
                 // Get the text associated with this symbol ID
                 match self.symbol_table.text_for(symbol_id) {
-                    Some(text) => RawSymbolTokenRef::Text(text),
+                    Some(text) => RawSymbolTokenRef::Text(text.into()),
                     None => RawSymbolTokenRef::SymbolId(symbol_id),
                 }
             }