From 1dcb8a52eebc1957e098ea3ea9a5221ff28c68d8 Mon Sep 17 00:00:00 2001 From: Kneasle Date: Tue, 28 Sep 2021 16:52:24 +0100 Subject: [PATCH] Implement string escaping --- fuzz/src/parser.rs | 26 ++++++------ grammar/src/char_set.rs | 2 +- grammar/src/grammar.rs | 62 ++++++++++++++++++++-------- grammar/src/spec/convert.rs | 79 ++++++++++++++++++++++++------------ grammar/src/spec/mod.rs | 22 +++++++++- grammar/src/tokenizer/mod.rs | 4 +- json.toml | 4 +- 7 files changed, 136 insertions(+), 63 deletions(-) diff --git a/fuzz/src/parser.rs b/fuzz/src/parser.rs index ec4a894..d1a6d17 100644 --- a/fuzz/src/parser.rs +++ b/fuzz/src/parser.rs @@ -102,7 +102,7 @@ impl Default for Config { fn default() -> Self { Config { average_ws_length: 5.0, - average_tree_size: 10.0, + average_tree_size: 2.0, max_stringy_regex_repeats: 15, tree_depth_limit: 15, tree_node_limit: 1_000, @@ -222,18 +222,20 @@ fn gen_elem( } PatternElement::Seq { pattern, delimiter } => { out.push(ast::Elem::SeqStart); - // TODO: Handle depth limiting better than this - if depth < data.tree_depth_limit && state.nodes_generated < data.tree_node_limit { - let mut is_first_node = true; - while state.rng.gen_range(0.0..1.0) > data.one_minus_new_segment_prob { - // Add delimiter between nodes - if !is_first_node { - out.push(ast::Elem::SeqDelim(*delimiter, table.gen_ws(state.rng))); - is_first_node = false; - } - // Add segment - gen_pattern(pattern, out, state, depth); + let mut is_first_node = true; + while state.rng.gen_range(0.0..1.0) > data.one_minus_new_segment_prob { + // TODO: Handle depth limiting better than this + if depth > data.tree_depth_limit || state.nodes_generated > data.tree_node_limit { + break; } + + // Add delimiter between nodes + if !is_first_node { + out.push(ast::Elem::SeqDelim(*delimiter, table.gen_ws(state.rng))); + } + // Add segment + gen_pattern(pattern, out, state, depth); + is_first_node = false; } out.push(ast::Elem::SeqEnd); } diff --git a/grammar/src/char_set.rs 
b/grammar/src/char_set.rs index cca3f3b..e2eeced 100644 --- a/grammar/src/char_set.rs +++ b/grammar/src/char_set.rs @@ -2,7 +2,7 @@ use std::{convert::identity, iter::FromIterator, ops::RangeInclusive}; use rand::Rng; -const FIRST_NON_ASCII_CHAR: char = '\u{128}'; +const FIRST_NON_ASCII_CHAR: char = '\u{80}'; /// An set of [`char`], optimised for sets where adjacent [`char`]s are very likely to either both /// be included or both excluded. diff --git a/grammar/src/grammar.rs b/grammar/src/grammar.rs index 5edc916..2beb5a8 100644 --- a/grammar/src/grammar.rs +++ b/grammar/src/grammar.rs @@ -1,13 +1,12 @@ use std::{ collections::HashSet, - fmt::{Debug, Formatter}, + fmt::{Debug, Formatter, Write}, }; use bimap::BiMap; use index_vec::{IndexSlice, IndexVec}; use itertools::Itertools; use regex::Regex; -use serde::Deserialize; use crate::{ char_set::{self, CharSet}, @@ -195,13 +194,20 @@ impl Stringy { pub fn create_display_string(&self, contents: &str) -> String { let mut display_str = String::with_capacity(contents.len() + 20); display_str.push_str(&self.delim_start); - for c in contents.chars() { - // TODO: handle escaping - display_str.push(c); + for ch in contents.chars() { + self.write_escaped_char(ch, &mut display_str); } display_str.push_str(&self.delim_end); display_str } + + /// Writes the escaped version of `c` to a [`String`] + pub fn write_escaped_char(&self, ch: char, out: &mut String) { + match &self.escape_rules { + Some(r) => r.write_escaped_char(ch, out), + None => out.push(ch), + } + } } /// The [`Regex`]es required to specify the valid strings of a [`Stringy`] node @@ -212,29 +218,49 @@ pub struct Regexes { pub(crate) anchored_both: Regex, } -#[derive(Debug, Clone, Deserialize)] -#[serde(deny_unknown_fields)] +/// Rules for how `char`s should be escaped +#[derive(Debug, Clone)] pub struct EscapeRules { /// A non-empty string that all escape sequences must start with. 
For example, in JSON strings /// this is `\` pub(crate) start_sequence: String, - /// Maps escape sequences (to go after `start_sequence`) to the de-escaped [`String`]. For - /// example, for JSON strings this is: + /// Maps chars to their escape sequence (i.e. the string which goes after + /// `self.start_sequence`). For example, for JSON strings this is: /// ```text - /// `\` -> '\\' (i.e. `\\` de-escapes to `\`) - /// `"` -> '"' (i.e. `\"` de-escapes to `"`) - /// `/` -> '/' - /// `n` -> '\n' - /// `t` -> '\t' - /// `b` -> '\u{8}' - /// `f` -> '\u{c}' - /// `r` -> '\r' + /// '\\' <-> "\\" (i.e. `\\` de-escapes to `\`) + /// '"' <-> "\"" (i.e. `\"` de-escapes to `"`) + /// '/' <-> "/" + /// '\n' <-> "n" + /// '\t' <-> "t" + /// '\u{8}' <-> "b" + /// '\u{c}' <-> "f" + /// '\r' <-> "r" /// ``` - pub(crate) rules: BiMap, + pub(crate) rules: BiMap, /// The prefix which takes 4 hex symbols and de-escapes them to that unicode code-point. For /// example, in JSON strings this is `u` (i.e. `\uABCD` would turn into the unicode code point /// `0xABCD`). pub(crate) unicode_hex_4: Option, + /// A set of `char`s which should not be escaped, even if a rule for them exists. 
+    pub(crate) dont_escape: CharSet, +} + +impl EscapeRules { +    fn write_escaped_char(&self, ch: char, out: &mut String) { +        if self.dont_escape.contains(ch) { +            // Char escaping overridden by `self.dont_escape` +            out.push(ch); +        } else if let Some(escape_seq) = self.rules.get_by_left(&ch) { +            // Char can be escaped using a specific escape sequence +            out.push_str(&self.start_sequence); +            out.push_str(escape_seq); +        } else if let Some(unicode_hex_4_prefix) = &self.unicode_hex_4 { +            // If 4-digit hex encoding is defined, then it's valid for all chars +            out.push_str(&self.start_sequence); +            out.push_str(unicode_hex_4_prefix); +            write!(out, "{:04X}", ch as u32).unwrap(); +        } +    } } ////////////// diff --git a/grammar/src/spec/convert.rs b/grammar/src/spec/convert.rs index 73903fc..f5e7170 100644 --- a/grammar/src/spec/convert.rs +++ b/grammar/src/spec/convert.rs @@ -156,35 +156,19 @@ fn convert_type( } => { assert!(stringy); // stringy should always be set to `true` - let validity_regex = validity_regex + let regexes = validity_regex // Compile two copies of the regex - .map(|regex_str| { - macro_rules! compile_regex { - ($string: expr) => { - Regex::new(&$string).map_err(|inner| ConvertError::Regex { - type_name: name.to_owned(), - regex: $string, - inner, - })?; - }; - } - - let str_unanchored = format!("(?x: {} )", regex_str); - let str_anchor_start = format!("^{}", str_unanchored); - let str_anchor_both = format!("^{}$", str_unanchored); - - Ok(grammar::Regexes { - unanchored: compile_regex!(str_unanchored), - anchored_start: compile_regex!(str_anchor_start), - anchored_both: compile_regex!(str_anchor_both), - }) - }) - // Convert the `Option>` into a `Result, E>` + .map(|regex_str| convert_validity_regex(&regex_str, &name)) + // Use `?` on the `Result` inside the `Option`. I.e. 
convert a + // `Option>` to `Option`, returning `Err(E)` if needed + .transpose()?; + let escape_rules = escape_rules + .map(|rules| convert_escape_rules(rules)) .transpose()?; let inner = grammar::Stringy { delim_start, delim_end, - regexes: validity_regex, + regexes, default_content, escape_rules, }; @@ -208,6 +192,21 @@ fn convert_type( }) } +fn convert_escape_rules(rules: super::EscapeRules) -> ConvertResult { + let super::EscapeRules { + start_sequence, + rules, + unicode_hex_4, + dont_escape, + } = rules; + Ok(grammar::EscapeRules { + start_sequence, + rules, + unicode_hex_4, + dont_escape: convert_char_set(dont_escape)?, + }) +} + ////////////////////// // TYPE DESCENDANTS // ////////////////////// @@ -287,9 +286,9 @@ fn enumerate_type_descendants( Ok(()) } -///////////////////////// -// PATTERNS/WHITESPACE // -///////////////////////// +////////////// +// PATTERNS // +////////////// fn compile_pattern( elems: super::Pattern, @@ -321,6 +320,32 @@ fn compile_pattern_element( }) } +/////////////////////////////// +// REGEX/WHITESPACE/CHAR SET // +/////////////////////////////// + +fn convert_validity_regex(regex_str: &str, type_name: &str) -> ConvertResult { + macro_rules! 
compile_regex { + ($string: expr) => { + Regex::new(&$string).map_err(|inner| ConvertError::Regex { + type_name: type_name.to_owned(), + regex: $string, + inner, + })?; + }; + } + + let str_unanchored = format!("(?x: {} )", regex_str); + let str_anchor_start = format!("^{}", str_unanchored); + let str_anchor_both = format!("^{}$", str_unanchored); + + Ok(grammar::Regexes { + unanchored: compile_regex!(str_unanchored), + anchored_start: compile_regex!(str_anchor_start), + anchored_both: compile_regex!(str_anchor_both), + }) +} + fn convert_whitespace(ws_chars: super::CharSet) -> ConvertResult { convert_char_set(ws_chars).map(grammar::Whitespace::from) } diff --git a/grammar/src/spec/mod.rs b/grammar/src/spec/mod.rs index 0cc7e04..33a22f9 100644 --- a/grammar/src/spec/mod.rs +++ b/grammar/src/spec/mod.rs @@ -12,9 +12,10 @@ pub(crate) mod convert; use std::collections::HashMap; +use bimap::BiMap; use serde::Deserialize; -use crate::{grammar, Grammar}; +use crate::Grammar; use self::convert::ConvertResult; @@ -83,7 +84,7 @@ pub(crate) enum Type { validity_regex: Option, #[serde(rename = "escape")] - escape_rules: Option, + escape_rules: Option, }, } @@ -114,6 +115,23 @@ pub(crate) enum PatternElement { }, } +/// See [`grammar::EscapeRules`] for docs. +#[derive(Debug, Clone, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct EscapeRules { + /// A non-empty string that all escape sequences must start with. For example, in JSON strings + /// this is `\` + pub(crate) start_sequence: String, + /// See [`grammar::EscapeRules::rules`] for docs. + pub(crate) rules: BiMap, + /// The prefix which takes 4 hex symbols and de-escapes them to that unicode code-point. For + /// example, in JSON strings this is `u` (i.e. `\uABCD` would turn into the unicode code point + /// `0xABCD`). + pub(crate) unicode_hex_4: Option, + /// A set of `char`s which should not be escaped, even if a rule for them exists. 
+ pub(crate) dont_escape: self::CharSet, +} + /// A set of `char`s, expressed as the contents of `[`, `]` in a regex (e.g. `a-zA-Z` will /// correspond to the regex `[a-zA-Z]`). #[derive(Debug, Clone, Deserialize)] diff --git a/grammar/src/tokenizer/mod.rs b/grammar/src/tokenizer/mod.rs index fea318b..cdc9617 100644 --- a/grammar/src/tokenizer/mod.rs +++ b/grammar/src/tokenizer/mod.rs @@ -235,9 +235,9 @@ fn eat_stringy<'s>( // Check if we have an escape string if ch == esc_start_char { // Consume explicit escapes if needed - for (escaped, content) in &esc_rules.rules { + for (content, escaped) in &esc_rules.rules { if iter.eat(escaped) { - contents.push_str(content); + contents.push(*content); continue 'outer; } } diff --git a/json.toml b/json.toml index 3e2a347..2bb8431 100644 --- a/json.toml +++ b/json.toml @@ -50,5 +50,7 @@ default = "" escape.start_sequence = '\' # `\b` is backspace, `\f` is form feed -escape.rules = { '"'='"', '\'='\', '/'='/', 'b'="\b", 'f'="\f", 'r'="\r", 'n'="\n", 't'="\t" } +escape.rules = { '"'='"', '\'='\', '/'='/', "\b"='b', "\f"='f', "\r"='r', "\n"='n', "\t"='t' } escape.unicode_hex_4 = 'u' # `\uWXYZ` -> `` +escape.dont_escape = '[[:print:]]--["\\]' # Always escape quote and backslash, but don't escape any + # other printable char