From 1dcb8a52eebc1957e098ea3ea9a5221ff28c68d8 Mon Sep 17 00:00:00 2001 From: Kneasle Date: Tue, 28 Sep 2021 16:52:24 +0100 Subject: [PATCH] Implement string escaping --- fuzz/src/parser.rs | 26 ++++++------ grammar/src/char_set.rs | 2 +- grammar/src/grammar.rs | 62 ++++++++++++++++++++-------- grammar/src/spec/convert.rs | 79 ++++++++++++++++++++++++------------ grammar/src/spec/mod.rs | 22 +++++++++- grammar/src/tokenizer/mod.rs | 4 +- json.toml | 4 +- 7 files changed, 136 insertions(+), 63 deletions(-) diff --git a/fuzz/src/parser.rs b/fuzz/src/parser.rs index ec4a894..d1a6d17 100644 --- a/fuzz/src/parser.rs +++ b/fuzz/src/parser.rs @@ -102,7 +102,7 @@ impl Default for Config { fn default() -> Self { Config { average_ws_length: 5.0, - average_tree_size: 10.0, + average_tree_size: 2.0, max_stringy_regex_repeats: 15, tree_depth_limit: 15, tree_node_limit: 1_000, @@ -222,18 +222,20 @@ fn gen_elem( } PatternElement::Seq { pattern, delimiter } => { out.push(ast::Elem::SeqStart); - // TODO: Handle depth limiting better than this - if depth < data.tree_depth_limit && state.nodes_generated < data.tree_node_limit { - let mut is_first_node = true; - while state.rng.gen_range(0.0..1.0) > data.one_minus_new_segment_prob { - // Add delimiter between nodes - if !is_first_node { - out.push(ast::Elem::SeqDelim(*delimiter, table.gen_ws(state.rng))); - is_first_node = false; - } - // Add segment - gen_pattern(pattern, out, state, depth); + let mut is_first_node = true; + while state.rng.gen_range(0.0..1.0) > data.one_minus_new_segment_prob { + // TODO: Handle depth limiting better than this + if depth > data.tree_depth_limit || state.nodes_generated > data.tree_node_limit { + break; } + + // Add delimiter between nodes + if !is_first_node { + out.push(ast::Elem::SeqDelim(*delimiter, table.gen_ws(state.rng))); + } + // Add segment + gen_pattern(pattern, out, state, depth); + is_first_node = false; } out.push(ast::Elem::SeqEnd); } diff --git a/grammar/src/char_set.rs 
b/grammar/src/char_set.rs index cca3f3b..e2eeced 100644 --- a/grammar/src/char_set.rs +++ b/grammar/src/char_set.rs @@ -2,7 +2,7 @@ use std::{convert::identity, iter::FromIterator, ops::RangeInclusive}; use rand::Rng; -const FIRST_NON_ASCII_CHAR: char = '\u{128}'; +const FIRST_NON_ASCII_CHAR: char = '\u{80}'; /// An set of [`char`], optimised for sets where adjacent [`char`]s are very likely to either both /// be included or both excluded. diff --git a/grammar/src/grammar.rs b/grammar/src/grammar.rs index 5edc916..2beb5a8 100644 --- a/grammar/src/grammar.rs +++ b/grammar/src/grammar.rs @@ -1,13 +1,12 @@ use std::{ collections::HashSet, - fmt::{Debug, Formatter}, + fmt::{Debug, Formatter, Write}, }; use bimap::BiMap; use index_vec::{IndexSlice, IndexVec}; use itertools::Itertools; use regex::Regex; -use serde::Deserialize; use crate::{ char_set::{self, CharSet}, @@ -195,13 +194,20 @@ impl Stringy { pub fn create_display_string(&self, contents: &str) -> String { let mut display_str = String::with_capacity(contents.len() + 20); display_str.push_str(&self.delim_start); - for c in contents.chars() { - // TODO: handle escaping - display_str.push(c); + for ch in contents.chars() { + self.write_escaped_char(ch, &mut display_str); } display_str.push_str(&self.delim_end); display_str } + + /// Writes the escaped version of `c` to a [`String`] + pub fn write_escaped_char(&self, ch: char, out: &mut String) { + match &self.escape_rules { + Some(r) => r.write_escaped_char(ch, out), + None => out.push(ch), + } + } } /// The [`Regex`]es required to specify the valid strings of a [`Stringy`] node @@ -212,29 +218,49 @@ pub struct Regexes { pub(crate) anchored_both: Regex, } -#[derive(Debug, Clone, Deserialize)] -#[serde(deny_unknown_fields)] +/// Rules for how `char`s should be escaped +#[derive(Debug, Clone)] pub struct EscapeRules { /// A non-empty string that all escape sequences must start with. 
For example, in JSON strings /// this is `\` pub(crate) start_sequence: String, - /// Maps escape sequences (to go after `start_sequence`) to the de-escaped [`String`]. For - /// example, for JSON strings this is: + /// Maps chars to their escape sequence (i.e. the string which goes after + /// `self.start_sequence`). For example, for JSON strings this is: /// ```text - /// `\` -> '\\' (i.e. `\\` de-escapes to `\`) - /// `"` -> '"' (i.e. `\"` de-escapes to `"`) - /// `/` -> '/' - /// `n` -> '\n' - /// `t` -> '\t' - /// `b` -> '\u{8}' - /// `f` -> '\u{c}' - /// `r` -> '\r' + /// '\\' <-> "\\" (i.e. `\\` de-escapes to `\`) + /// '"' <-> "\"" (i.e. `\"` de-escapes to `"`) + /// '/' <-> "/" + /// '\n' <-> "n" + /// '\t' <-> "t" + /// '\u{8}' <-> "b" + /// '\u{c}' <-> "f" + /// '\r' <-> "r" /// ``` - pub(crate) rules: BiMap, + pub(crate) rules: BiMap, /// The prefix which takes 4 hex symbols and de-escapes them to that unicode code-point. For /// example, in JSON strings this is `u` (i.e. `\uABCD` would turn into the unicode code point /// `0xABCD`). pub(crate) unicode_hex_4: Option, + /// A set of `char`s which should not be escaped, even if a rule for them exists. 
+    pub(crate) dont_escape: CharSet, +} + +impl EscapeRules { +    fn write_escaped_char(&self, ch: char, out: &mut String) { +        if self.dont_escape.contains(ch) { +            // Char escaping overridden by `self.dont_escape` +            out.push(ch); +        } else if let Some(escape_seq) = self.rules.get_by_left(&ch) { +            // Char can be escaped using a specific escape sequence +            out.push_str(&self.start_sequence); +            out.push_str(escape_seq); +        } else if let Some(unicode_hex_4_prefix) = &self.unicode_hex_4 { +            // If 4-digit hex encoding is defined, then it's valid for all chars +            out.push_str(&self.start_sequence); +            out.push_str(unicode_hex_4_prefix); +            write!(out, "{:04X}", ch as u32).unwrap(); +        } +    } } ////////////// diff --git a/grammar/src/spec/convert.rs b/grammar/src/spec/convert.rs index 73903fc..f5e7170 100644 --- a/grammar/src/spec/convert.rs +++ b/grammar/src/spec/convert.rs @@ -156,35 +156,19 @@ fn convert_type( } => { assert!(stringy); // stringy should always be set to `true` - let validity_regex = validity_regex + let regexes = validity_regex // Compile two copies of the regex - .map(|regex_str| { - macro_rules! compile_regex { - ($string: expr) => { - Regex::new(&$string).map_err(|inner| ConvertError::Regex { - type_name: name.to_owned(), - regex: $string, - inner, - })?; - }; - } - - let str_unanchored = format!("(?x: {} )", regex_str); - let str_anchor_start = format!("^{}", str_unanchored); - let str_anchor_both = format!("^{}$", str_unanchored); - - Ok(grammar::Regexes { - unanchored: compile_regex!(str_unanchored), - anchored_start: compile_regex!(str_anchor_start), - anchored_both: compile_regex!(str_anchor_both), - }) - }) - // Convert the `Option>` into a `Result, E>` + .map(|regex_str| convert_validity_regex(&regex_str, &name)) + // Use `?` on the `Result` inside the `Option`. I.e. 
convert a + // `Option>` to `Option`, returning `Err(E)` if needed + .transpose()?; + let escape_rules = escape_rules + .map(|rules| convert_escape_rules(rules)) .transpose()?; let inner = grammar::Stringy { delim_start, delim_end, - regexes: validity_regex, + regexes, default_content, escape_rules, }; @@ -208,6 +192,21 @@ fn convert_type( }) } +fn convert_escape_rules(rules: super::EscapeRules) -> ConvertResult { + let super::EscapeRules { + start_sequence, + rules, + unicode_hex_4, + dont_escape, + } = rules; + Ok(grammar::EscapeRules { + start_sequence, + rules, + unicode_hex_4, + dont_escape: convert_char_set(dont_escape)?, + }) +} + ////////////////////// // TYPE DESCENDANTS // ////////////////////// @@ -287,9 +286,9 @@ fn enumerate_type_descendants( Ok(()) } -///////////////////////// -// PATTERNS/WHITESPACE // -///////////////////////// +////////////// +// PATTERNS // +////////////// fn compile_pattern( elems: super::Pattern, @@ -321,6 +320,32 @@ fn compile_pattern_element( }) } +/////////////////////////////// +// REGEX/WHITESPACE/CHAR SET // +/////////////////////////////// + +fn convert_validity_regex(regex_str: &str, type_name: &str) -> ConvertResult { + macro_rules! 
compile_regex { + ($string: expr) => { + Regex::new(&$string).map_err(|inner| ConvertError::Regex { + type_name: type_name.to_owned(), + regex: $string, + inner, + })?; + }; + } + + let str_unanchored = format!("(?x: {} )", regex_str); + let str_anchor_start = format!("^{}", str_unanchored); + let str_anchor_both = format!("^{}$", str_unanchored); + + Ok(grammar::Regexes { + unanchored: compile_regex!(str_unanchored), + anchored_start: compile_regex!(str_anchor_start), + anchored_both: compile_regex!(str_anchor_both), + }) +} + fn convert_whitespace(ws_chars: super::CharSet) -> ConvertResult { convert_char_set(ws_chars).map(grammar::Whitespace::from) } diff --git a/grammar/src/spec/mod.rs b/grammar/src/spec/mod.rs index 0cc7e04..33a22f9 100644 --- a/grammar/src/spec/mod.rs +++ b/grammar/src/spec/mod.rs @@ -12,9 +12,10 @@ pub(crate) mod convert; use std::collections::HashMap; +use bimap::BiMap; use serde::Deserialize; -use crate::{grammar, Grammar}; +use crate::Grammar; use self::convert::ConvertResult; @@ -83,7 +84,7 @@ pub(crate) enum Type { validity_regex: Option, #[serde(rename = "escape")] - escape_rules: Option, + escape_rules: Option, }, } @@ -114,6 +115,23 @@ pub(crate) enum PatternElement { }, } +/// See [`grammar::EscapeRules`] for docs. +#[derive(Debug, Clone, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct EscapeRules { + /// A non-empty string that all escape sequences must start with. For example, in JSON strings + /// this is `\` + pub(crate) start_sequence: String, + /// See [`grammar::EscapeRules::rules`] for docs. + pub(crate) rules: BiMap, + /// The prefix which takes 4 hex symbols and de-escapes them to that unicode code-point. For + /// example, in JSON strings this is `u` (i.e. `\uABCD` would turn into the unicode code point + /// `0xABCD`). + pub(crate) unicode_hex_4: Option, + /// A set of `char`s which should not be escaped, even if a rule for them exists. 
+ pub(crate) dont_escape: self::CharSet, +} + /// A set of `char`s, expressed as the contents of `[`, `]` in a regex (e.g. `a-zA-Z` will /// correspond to the regex `[a-zA-Z]`). #[derive(Debug, Clone, Deserialize)] diff --git a/grammar/src/tokenizer/mod.rs b/grammar/src/tokenizer/mod.rs index fea318b..cdc9617 100644 --- a/grammar/src/tokenizer/mod.rs +++ b/grammar/src/tokenizer/mod.rs @@ -235,9 +235,9 @@ fn eat_stringy<'s>( // Check if we have an escape string if ch == esc_start_char { // Consume explicit escapes if needed - for (escaped, content) in &esc_rules.rules { + for (content, escaped) in &esc_rules.rules { if iter.eat(escaped) { - contents.push_str(content); + contents.push(*content); continue 'outer; } } diff --git a/json.toml b/json.toml index 3e2a347..2bb8431 100644 --- a/json.toml +++ b/json.toml @@ -50,5 +50,7 @@ default = "" escape.start_sequence = '\' # `\b` is backspace, `\f` is form feed -escape.rules = { '"'='"', '\'='\', '/'='/', 'b'="\b", 'f'="\f", 'r'="\r", 'n'="\n", 't'="\t" } +escape.rules = { '"'='"', '\'='\', '/'='/', "\b"='b', "\f"='f', "\r"='r', "\n"='n', "\t"='t' } escape.unicode_hex_4 = 'u' # `\uWXYZ` -> `` +escape.dont_escape = '[[:print:]]--["\\]' # Always escape quote and backslash, but don't escape any + # other printable char