Skip to content

Commit

Permalink
Implement string escaping
Browse files Browse the repository at this point in the history
  • Loading branch information
kneasle committed Sep 28, 2021
1 parent db6fdcb commit 1dcb8a5
Show file tree
Hide file tree
Showing 7 changed files with 136 additions and 63 deletions.
26 changes: 14 additions & 12 deletions fuzz/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ impl Default for Config {
fn default() -> Self {
Config {
average_ws_length: 5.0,
average_tree_size: 10.0,
average_tree_size: 2.0,
max_stringy_regex_repeats: 15,
tree_depth_limit: 15,
tree_node_limit: 1_000,
Expand Down Expand Up @@ -222,18 +222,20 @@ fn gen_elem(
}
PatternElement::Seq { pattern, delimiter } => {
out.push(ast::Elem::SeqStart);
// TODO: Handle depth limiting better than this
if depth < data.tree_depth_limit && state.nodes_generated < data.tree_node_limit {
let mut is_first_node = true;
while state.rng.gen_range(0.0..1.0) > data.one_minus_new_segment_prob {
// Add delimiter between nodes
if !is_first_node {
out.push(ast::Elem::SeqDelim(*delimiter, table.gen_ws(state.rng)));
is_first_node = false;
}
// Add segment
gen_pattern(pattern, out, state, depth);
let mut is_first_node = true;
while state.rng.gen_range(0.0..1.0) > data.one_minus_new_segment_prob {
// TODO: Handle depth limiting better than this
if depth > data.tree_depth_limit || state.nodes_generated > data.tree_node_limit {
break;
}

// Add delimiter between nodes
if !is_first_node {
out.push(ast::Elem::SeqDelim(*delimiter, table.gen_ws(state.rng)));
}
// Add segment
gen_pattern(pattern, out, state, depth);
is_first_node = false;
}
out.push(ast::Elem::SeqEnd);
}
Expand Down
2 changes: 1 addition & 1 deletion grammar/src/char_set.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use std::{convert::identity, iter::FromIterator, ops::RangeInclusive};

use rand::Rng;

const FIRST_NON_ASCII_CHAR: char = '\u{128}';
const FIRST_NON_ASCII_CHAR: char = '\u{80}';

/// A set of [`char`], optimised for sets where adjacent [`char`]s are very likely to either both
/// be included or both excluded.
Expand Down
62 changes: 44 additions & 18 deletions grammar/src/grammar.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
use std::{
collections::HashSet,
fmt::{Debug, Formatter},
fmt::{Debug, Formatter, Write},
};

use bimap::BiMap;
use index_vec::{IndexSlice, IndexVec};
use itertools::Itertools;
use regex::Regex;
use serde::Deserialize;

use crate::{
char_set::{self, CharSet},
Expand Down Expand Up @@ -195,13 +194,20 @@ impl Stringy {
pub fn create_display_string(&self, contents: &str) -> String {
let mut display_str = String::with_capacity(contents.len() + 20);
display_str.push_str(&self.delim_start);
for c in contents.chars() {
// TODO: handle escaping
display_str.push(c);
for ch in contents.chars() {
self.write_escaped_char(ch, &mut display_str);
}
display_str.push_str(&self.delim_end);
display_str
}

/// Appends the escaped form of `ch` to `out`.
///
/// When this node has escape rules, they decide how `ch` is written; when it has none, `ch` is
/// appended unchanged.
pub fn write_escaped_char(&self, ch: char, out: &mut String) {
    if let Some(rules) = &self.escape_rules {
        rules.write_escaped_char(ch, out);
    } else {
        out.push(ch);
    }
}
}

/// The [`Regex`]es required to specify the valid strings of a [`Stringy`] node
Expand All @@ -212,29 +218,49 @@ pub struct Regexes {
pub(crate) anchored_both: Regex,
}

#[derive(Debug, Clone, Deserialize)]
#[serde(deny_unknown_fields)]
/// Rules for how `char`s should be escaped
#[derive(Debug, Clone)]
pub struct EscapeRules {
/// A non-empty string that all escape sequences must start with. For example, in JSON strings
/// this is `\`
pub(crate) start_sequence: String,
/// Maps escape sequences (to go after `start_sequence`) to the de-escaped [`String`]. For
/// example, for JSON strings this is:
/// Maps chars to their escape sequence (i.e. the string which goes after
/// `self.start_sequence`). For example, for JSON strings this is:
/// ```text
/// `\` -> '\\' (i.e. `\\` de-escapes to `\`)
/// `"` -> '"' (i.e. `\"` de-escapes to `"`)
/// `/` -> '/'
/// `n` -> '\n'
/// `t` -> '\t'
/// `b` -> '\u{8}'
/// `f` -> '\u{c}'
/// `r` -> '\r'
/// '\\' <-> "\\" (i.e. `\\` de-escapes to `\`)
/// '"' <-> "\"" (i.e. `\"` de-escapes to `"`)
/// '/' <-> "/"
/// '\n' <-> "n"
/// '\t' <-> "t"
/// '\u{8}' <-> "b"
/// '\u{c}' <-> "f"
/// '\r' <-> "r"
/// ```
pub(crate) rules: BiMap<String, String>,
pub(crate) rules: BiMap<char, String>,
/// The prefix which takes 4 hex symbols and de-escapes them to that unicode code-point. For
/// example, in JSON strings this is `u` (i.e. `\uABCD` would turn into the unicode code point
/// `0xABCD`).
pub(crate) unicode_hex_4: Option<String>,
/// A set of `char`s which should not be escaped, even if a rule for them exists.
pub(crate) dont_escape: CharSet,
}

impl EscapeRules {
    /// Appends the escaped form of `ch` to `out`.
    ///
    /// The following options are tried in order:
    /// 1. If `ch` is in `self.dont_escape`, it is written verbatim.
    /// 2. If `self.rules` has an escape sequence for `ch`, `self.start_sequence` followed by
    ///    that sequence is written.
    /// 3. If `self.unicode_hex_4` is set and `ch` fits in four hex digits (i.e. is at most
    ///    `U+FFFF`), `self.start_sequence`, the prefix and the four upper-case hex digits are
    ///    written.
    /// 4. Otherwise `ch` is written verbatim, so that no `char` is ever silently dropped.
    fn write_escaped_char(&self, ch: char, out: &mut String) {
        if self.dont_escape.contains(ch) {
            // Char escaping overridden by `self.dont_escape`
            out.push(ch);
            return;
        }
        if let Some(escape_seq) = self.rules.get_by_left(&ch) {
            // Char can be escaped using a specific escape sequence
            out.push_str(&self.start_sequence);
            out.push_str(escape_seq);
            return;
        }

        let code_point = u32::from(ch);
        match &self.unicode_hex_4 {
            // The 4-digit form can only express code points up to `U+FFFF`; formatting a larger
            // value with `{:04X}` would emit 5 or 6 digits, which de-escapes to the wrong text.
            Some(unicode_hex_4_prefix) if code_point <= 0xFFFF => {
                out.push_str(&self.start_sequence);
                out.push_str(unicode_hex_4_prefix);
                write!(out, "{:04X}", code_point).expect("writing to a `String` cannot fail");
            }
            // No escape sequence can represent this char, so emit it as-is rather than losing it.
            // NOTE(review): formats which require surrogate-pair escapes for non-BMP chars would
            // need matching support on the de-escaping side before they could be emitted here.
            _ => out.push(ch),
        }
    }
}

//////////////
Expand Down
79 changes: 52 additions & 27 deletions grammar/src/spec/convert.rs
Original file line number Diff line number Diff line change
Expand Up @@ -156,35 +156,19 @@ fn convert_type(
} => {
assert!(stringy); // stringy should always be set to `true`

let validity_regex = validity_regex
let regexes = validity_regex
// Compile two copies of the regex
.map(|regex_str| {
macro_rules! compile_regex {
($string: expr) => {
Regex::new(&$string).map_err(|inner| ConvertError::Regex {
type_name: name.to_owned(),
regex: $string,
inner,
})?;
};
}

let str_unanchored = format!("(?x: {} )", regex_str);
let str_anchor_start = format!("^{}", str_unanchored);
let str_anchor_both = format!("^{}$", str_unanchored);

Ok(grammar::Regexes {
unanchored: compile_regex!(str_unanchored),
anchored_start: compile_regex!(str_anchor_start),
anchored_both: compile_regex!(str_anchor_both),
})
})
// Convert the `Option<Result<R, E>>` into a `Result<Option<R>, E>`
.map(|regex_str| convert_validity_regex(&regex_str, &name))
// Use `?` on the `Result` inside the `Option`. I.e. convert a
// `Option<Result<T, E>>` to `Option<T>`, returning `Err(E)` if needed
.transpose()?;
let escape_rules = escape_rules
.map(|rules| convert_escape_rules(rules))
.transpose()?;
let inner = grammar::Stringy {
delim_start,
delim_end,
regexes: validity_regex,
regexes,
default_content,
escape_rules,
};
Expand All @@ -208,6 +192,21 @@ fn convert_type(
})
}

/// Converts the deserialised escape rules of a stringy type into a [`grammar::EscapeRules`].
///
/// Only the `dont_escape` char set needs real conversion (which may fail); every other field is
/// moved across unchanged.
fn convert_escape_rules(rules: super::EscapeRules) -> ConvertResult<grammar::EscapeRules> {
    // Destructure exhaustively so that adding a field to the spec type is a compile error here
    let super::EscapeRules {
        start_sequence,
        rules: char_rules,
        unicode_hex_4,
        dont_escape: dont_escape_spec,
    } = rules;

    let dont_escape = convert_char_set(dont_escape_spec)?;
    Ok(grammar::EscapeRules {
        start_sequence,
        rules: char_rules,
        unicode_hex_4,
        dont_escape,
    })
}

//////////////////////
// TYPE DESCENDANTS //
//////////////////////
Expand Down Expand Up @@ -287,9 +286,9 @@ fn enumerate_type_descendants(
Ok(())
}

/////////////////////////
// PATTERNS/WHITESPACE //
/////////////////////////
//////////////
// PATTERNS //
//////////////

fn compile_pattern(
elems: super::Pattern,
Expand Down Expand Up @@ -321,6 +320,32 @@ fn compile_pattern_element(
})
}

///////////////////////////////
// REGEX/WHITESPACE/CHAR SET //
///////////////////////////////

fn convert_validity_regex(regex_str: &str, type_name: &str) -> ConvertResult<grammar::Regexes> {
macro_rules! compile_regex {
($string: expr) => {
Regex::new(&$string).map_err(|inner| ConvertError::Regex {
type_name: type_name.to_owned(),
regex: $string,
inner,
})?;
};
}

let str_unanchored = format!("(?x: {} )", regex_str);
let str_anchor_start = format!("^{}", str_unanchored);
let str_anchor_both = format!("^{}$", str_unanchored);

Ok(grammar::Regexes {
unanchored: compile_regex!(str_unanchored),
anchored_start: compile_regex!(str_anchor_start),
anchored_both: compile_regex!(str_anchor_both),
})
}

/// Converts a whitespace char-set specification into a [`grammar::Whitespace`], propagating any
/// error from the char-set conversion.
fn convert_whitespace(ws_chars: super::CharSet) -> ConvertResult<grammar::Whitespace> {
    let char_set = convert_char_set(ws_chars)?;
    Ok(grammar::Whitespace::from(char_set))
}
Expand Down
22 changes: 20 additions & 2 deletions grammar/src/spec/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@ pub(crate) mod convert;

use std::collections::HashMap;

use bimap::BiMap;
use serde::Deserialize;

use crate::{grammar, Grammar};
use crate::Grammar;

use self::convert::ConvertResult;

Expand Down Expand Up @@ -83,7 +84,7 @@ pub(crate) enum Type {
validity_regex: Option<String>,

#[serde(rename = "escape")]
escape_rules: Option<grammar::EscapeRules>,
escape_rules: Option<self::EscapeRules>,
},
}

Expand Down Expand Up @@ -114,6 +115,23 @@ pub(crate) enum PatternElement {
},
}

/// Deserialised form of the escaping rules for a stringy type. It is turned into a
/// [`crate::grammar::EscapeRules`] by `convert::convert_escape_rules`; see that type for the full
/// docs.
#[derive(Debug, Clone, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct EscapeRules {
/// A non-empty string that all escape sequences must start with. For example, in JSON strings
/// this is `\`
pub(crate) start_sequence: String,
/// Maps each `char` to its escape sequence (i.e. the string which goes after
/// `start_sequence`). See the `rules` field of [`crate::grammar::EscapeRules`] for an example.
pub(crate) rules: BiMap<char, String>,
/// The prefix which takes 4 hex symbols and de-escapes them to that unicode code-point. For
/// example, in JSON strings this is `u` (i.e. `\uABCD` would turn into the unicode code point
/// `0xABCD`).
pub(crate) unicode_hex_4: Option<String>,
/// A set of `char`s which should not be escaped, even if a rule for them exists. This is still
/// in its specification form here, and is converted into a char set during conversion.
pub(crate) dont_escape: self::CharSet,
}

/// A set of `char`s, expressed as the contents of `[`, `]` in a regex (e.g. `a-zA-Z` will
/// correspond to the regex `[a-zA-Z]`).
#[derive(Debug, Clone, Deserialize)]
Expand Down
4 changes: 2 additions & 2 deletions grammar/src/tokenizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -235,9 +235,9 @@ fn eat_stringy<'s>(
// Check if we have an escape string
if ch == esc_start_char {
// Consume explicit escapes if needed
for (escaped, content) in &esc_rules.rules {
for (content, escaped) in &esc_rules.rules {
if iter.eat(escaped) {
contents.push_str(content);
contents.push(*content);
continue 'outer;
}
}
Expand Down
4 changes: 3 additions & 1 deletion json.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,5 +50,7 @@ default = ""

escape.start_sequence = '\'
# `\b` is backspace, `\f` is form feed
escape.rules = { '"'='"', '\'='\', '/'='/', 'b'="\b", 'f'="\f", 'r'="\r", 'n'="\n", 't'="\t" }
escape.rules = { '"'='"', '\'='\', '/'='/', "\b"='b', "\f"='f', "\r"='r', "\n"='n', "\t"='t' }
escape.unicode_hex_4 = 'u' # `\uWXYZ` -> `<unicode codepoint 0xWXYZ>`
escape.dont_escape = '[[:print:]]--["\\]' # Always escape quote and backslash, but don't escape any
# other printable char

0 comments on commit 1dcb8a5

Please sign in to comment.