Skip to content

Commit

Permalink
dealing with string arrays marginally better (#32)
Browse files Browse the repository at this point in the history
Attempt to parse arrays more comprehensively, so that arrays of JSON values and elements containing escaped quotes or commas are handled.
  • Loading branch information
danturn authored Aug 22, 2022
1 parent 84dfabd commit 20213f0
Show file tree
Hide file tree
Showing 11 changed files with 246 additions and 45 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ serde = { version = "1.0", features = ["derive"] }
structopt = "0.3"
uuid = { version = "0.8", features = [ "v4"] }
mimalloc = "0.1.29"
log = "0.4.17"
zstd = "0.11.2"

[dev-dependencies]
Expand Down
2 changes: 2 additions & 0 deletions src/file_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ mod tests {
strategy_tuple("id"),
strategy_tuple("description"),
strategy_tuple("price"),
strategy_tuple("details"),
strategy_tuple("tags"),
]),
);

Expand Down
23 changes: 23 additions & 0 deletions src/parsers/data_row.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
use core::str::Split;
/// Splits one data row into its tab-separated column values.
///
/// A single trailing `'\n'`, when present, is dropped before splitting so the
/// final column never carries the line terminator. Empty columns are yielded
/// as empty strings.
pub fn split(line: &str) -> Split<'_, char> {
    let row = match line.strip_suffix('\n') {
        Some(without_newline) => without_newline,
        None => line,
    };
    row.split('\t')
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Collects the columns produced by `split` so they can be compared directly.
    fn columns(line: &str) -> Vec<&str> {
        split(line).collect()
    }

    // The trailing newline is stripped and every tab starts a new column.
    #[test]
    fn can_split() {
        assert_eq!(columns("1\t2\t3\t4\n"), vec!["1", "2", "3", "4"]);
    }

    // An empty last column must survive as an empty string.
    #[test]
    fn can_split_with_empty_string_at_end() {
        assert_eq!(columns("1\t2\t3\t\n"), vec!["1", "2", "3", ""]);
    }
}
1 change: 1 addition & 0 deletions src/parsers/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
pub mod copy_row;
pub mod create_row;
pub mod data_row;
pub mod db_schema;
pub mod national_insurance_number;
pub mod rng;
Expand Down
35 changes: 29 additions & 6 deletions src/parsers/row_parser.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use crate::parsers::copy_row;
use crate::parsers::copy_row::CurrentTableTransforms;
use crate::parsers::create_row;
use crate::parsers::sanitiser;
Expand All @@ -7,6 +6,7 @@ use crate::parsers::strategies::Strategies;
use crate::parsers::transformer;
use crate::parsers::types;
use crate::parsers::types::Column;
use crate::parsers::{copy_row, data_row};
use itertools::Itertools;
use rand::rngs::SmallRng;
use std::borrow::Cow;
Expand Down Expand Up @@ -114,7 +114,7 @@ fn transform_row(
current_table: &CurrentTableTransforms,
types: &Types,
) -> String {
let column_values = split_row(line);
let column_values = data_row::split(line);

let mut transformed = column_values.enumerate().map(|(i, value)| {
let current_column = &current_table.columns[i];
Expand Down Expand Up @@ -152,10 +152,6 @@ fn add_create_table_row_to_types(line: &str, mut current_types: Vec<Column>) ->
current_types
}

/// Breaks a row into its tab-delimited column values, ignoring one trailing newline.
fn split_row(line: &str) -> std::str::Split<char> {
    let without_newline = if let Some(stripped) = line.strip_suffix('\n') {
        stripped
    } else {
        line
    };
    without_newline.split('\t')
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down Expand Up @@ -379,6 +375,33 @@ mod tests {
assert_eq!(non_table_data_row, transformed_row);
}

// A row whose last column is empty must be returned exactly as it came in,
// trailing tab included.
#[test]
fn table_data_with_empty_final_column() {
    let table = "public.users";
    let row = "123\tPeter\t\n";
    let strategies = Strategies::new_from(table.to_string(), HashMap::new());

    let columns = ["column_1", "column_2", "column_3"]
        .iter()
        .map(|&name| ColumnInfo::builder().with_name(name).build())
        .collect();
    let types = Types::builder()
        .add_type(table, "column_1", SubType::Character)
        .add_type(table, "column_2", SubType::Character)
        .add_type(table, "column_3", SubType::Character)
        .build();
    let mut state = State {
        position: Position::InCopy {
            current_table: CurrentTableTransforms {
                table_name: table.to_string(),
                columns,
            },
        },
        types,
    };

    let mut rng = rng::get();
    let actual = parse(&mut rng, row, &mut state, &strategies);
    assert_eq!("123\tPeter\t\n", actual);
}

#[test]
fn table_data_is_transformed() {
let table_data_row = "123\tPeter\tPuckleberry\n";
Expand Down
28 changes: 27 additions & 1 deletion src/parsers/sanitiser.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,33 @@
/// Strips leading and trailing spaces and newlines from `line`.
///
/// Only `' '` and `'\n'` are removed. Tabs are deliberately left in place so
/// that a leading or trailing tab (an empty first or last column in a
/// tab-separated row) survives trimming.
pub fn trim(line: &str) -> &str {
    line.trim_matches(|c| c == ' ' || c == '\n')
}

/// Returns a copy of `line` with every double-quote character removed.
pub fn dequote_column_or_table_name_data(line: &str) -> String {
    line.chars().filter(|&ch| ch != '"').collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    // Spaces and newlines at either end are removed.
    #[test]
    fn trims_leading_whitespace_and_newline() {
        assert_eq!(trim(" 1234\n"), "1234");
    }

    // Tabs at either end are kept; only the surrounding space and newline go.
    #[test]
    fn does_not_trim_tabs() {
        assert_eq!(trim(" \t1234\t\n"), "\t1234\t");
    }

    // Surrounding double quotes are stripped from an identifier.
    #[test]
    fn dequotes_given_string() {
        assert_eq!(dequote_column_or_table_name_data("\"order\""), "order");
    }
}
158 changes: 127 additions & 31 deletions src/parsers/transformer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@ use fake::faker::company::en::*;
use fake::faker::internet::en::*;
use fake::faker::name::en::*;
use fake::Fake;
use log::trace;
use rand::SeedableRng;
use rand::{rngs::SmallRng, Rng};
use std::borrow::Cow;
use std::collections::HashMap;
use std::fmt::Write;
use std::sync::atomic::{AtomicUsize, Ordering};
use uuid::Uuid;

Expand Down Expand Up @@ -49,6 +51,9 @@ pub fn transform<'line>(

let unique = get_unique();

// TODO: return an error when a transformer is inappropriate for the column type, e.g. scramble
// on json should produce a clear error rather than generating invalid SQL.

match transformer.name {
TransformerType::Error => {
panic!("Error transform still in place for table: {}", table_name)
Expand All @@ -70,12 +75,12 @@ pub fn transform<'line>(
TransformerType::FakeStreetAddress => Cow::from(fake_street_address()),
TransformerType::FakeState => Cow::from(StateName().fake::<String>()),
TransformerType::FakeUsername => Cow::from(fake_username(&transformer.args, unique)),
//TODO not tested VV
TransformerType::FakeUUID => Cow::from(Uuid::new_v4().to_string()),
TransformerType::Scramble => Cow::from(scramble(rng, value)),
TransformerType::ObfuscateDay => Cow::from(obfuscate_day(value, table_name)),
TransformerType::Fixed => fixed(&transformer.args, table_name),
TransformerType::Identity => Cow::from(value),
TransformerType::ObfuscateDay => Cow::from(obfuscate_day(value, table_name)),
TransformerType::Scramble => Cow::from(scramble(rng, value)),
//TODO not tested VV
TransformerType::FakeUUID => Cow::from(Uuid::new_v4().to_string()),
}
}

Expand All @@ -86,34 +91,80 @@ fn transform_array<'value>(
transformer: &Transformer,
table_name: &str,
) -> Cow<'value, str> {
let is_string_array = underlying_type == &SubType::Character;
let unsplit_array = &value[1..value.len() - 1];
let quoted_types = vec![SubType::Character, SubType::Json];
let requires_quotes = quoted_types.contains(underlying_type);

let sub_type = SingleValue {
sub_type: underlying_type.clone(),
};

let array: Vec<_> = unsplit_array
.split(", ")
.map(|list_item| {
if is_string_array {
let list_item_without_enclosing_quotes = &list_item[1..list_item.len() - 1];
let transformed = transform(
rng,
list_item_without_enclosing_quotes,
&sub_type,
transformer,
table_name,
);

Cow::from(format!("\"{}\"", transformed))
} else {
transform(rng, list_item, &sub_type, transformer, table_name)
}
})
.collect();
let transformed_array = if requires_quotes {
transform_quoted_array(rng, value, &sub_type, transformer, table_name)
} else {
let unsplit_array = &value[1..value.len() - 1];
unsplit_array
.split(", ")
.map(|list_item| transform(rng, list_item, &sub_type, transformer, table_name))
.collect::<Vec<Cow<str>>>()
.join(",")
};
Cow::from(format!("{{{}}}", transformed_array))
}

/// Transforms every element of a quoted array literal such as `{a,b,"c or d"}`.
///
/// `value` is scanned character by character, skipping the first character
/// (the opening brace). An element that starts with `"` runs until the next
/// `"` that is not directly preceded by a backslash, so commas and
/// backslash-escaped quotes inside it are kept as part of the element. Any
/// other element runs until the next `,` or `}`. Each element is passed to
/// `transform` and written back wrapped in double quotes.
///
/// Returns the transformed elements joined by `,` without surrounding braces,
/// or an empty string when no complete element was found.
fn transform_quoted_array<'value>(
    rng: &mut SmallRng,
    value: &'value str,
    sub_type: &Type,
    transformer: &Transformer,
    table_name: &str,
) -> String {
    let mut inside_word = false;
    let mut word_is_quoted = false;
    let mut current_word = String::new();
    let mut word_acc = String::new();
    // Placeholder: anything other than a backslash works before the first character is seen.
    let mut last_char_seen: char = 'a';
    // Only used in trace output; `saturating_sub` keeps an empty `value` from underflowing.
    let last_char_index = value.len().saturating_sub(1);
    for (i, c) in value.chars().enumerate() {
        trace!("-----------");
        trace!("current value is '{}'", c);
        if i == 0 {
            // Skip the opening brace.
            continue;
        } else if !inside_word && c == '"' {
            // Opening quote of a quoted element.
            // NOTE(review): an empty quoted element (`""`) is read as two opening quotes, so it
            // is not emitted and the next element is treated as quoted — confirm whether empty
            // strings can occur in the data.
            word_is_quoted = true;
            continue;
        } else if !inside_word && c == ',' {
            // Separator between elements.
            continue;
        } else if inside_word
            && ((word_is_quoted && c == '"' && last_char_seen != '\\')
                || (!word_is_quoted && c == ',')
                || (!word_is_quoted && c == '}'))
        {
            // End of the current element: transform it and emit it quoted, followed by a comma.
            inside_word = false;
            word_is_quoted = false;
            let transformed = transform(rng, &current_word, sub_type, transformer, table_name);
            write!(word_acc, "\"{}\",", &transformed)
                .expect("Should be able to apppend to word_acc");
            // Reuse the buffer for the next element instead of allocating a new one.
            current_word.clear();
            trace!("its the end of a word");
        } else {
            inside_word = true;
            current_word.push(c);
        }

        last_char_seen = c;
        trace!(
            "current_word: '{}', inside_word: '{}', last_char_seen: '{}', index: '{}/{}'",
            current_word,
            inside_word,
            last_char_seen,
            i,
            last_char_index
        );
    }
    trace!("\noutput - {:?}", word_acc);
    // Drop the trailing comma written after the last element (no-op when nothing was emitted).
    word_acc.pop();
    word_acc
}

fn prepend_unique_if_present(
Expand Down Expand Up @@ -275,7 +326,6 @@ fn scramble(rng: &mut SmallRng, original_value: &str) -> String {
#[cfg(test)]
mod tests {
use super::*;
use crate::parsers::national_insurance_number;
use crate::parsers::rng;
use regex::Regex;

Expand Down Expand Up @@ -924,7 +974,32 @@ mod tests {

// Scrambling a character array must change the value while keeping the shape of every element.
// Both unquoted (`a`) and quoted (`"c or d"`) inputs come back double-quoted and joined by `,`.
#[test]
fn can_scramble_array_string_fields() {
    let initial_value = r#"{a,b,"c or d"}"#;
    let mut rng = rng::get();
    let new_value = transform(
        &mut rng,
        initial_value,
        &Type::Array {
            sub_type: SubType::Character,
        },
        &Transformer {
            name: TransformerType::Scramble,
            args: None,
        },
        TABLE_NAME,
    );
    assert_ne!(new_value, initial_value);
    let re = Regex::new(r#"^\{"[a-z]","[a-z]","[a-z] [a-z]{2} [a-z]"\}$"#).unwrap();
    assert!(
        re.is_match(&new_value),
        "new value: \"{}\" does not contain same digit / alphabet structure as input",
        new_value
    );
}

#[test]
fn can_deal_with_commas_inside_values() {
let initial_value = r#"{"A, or B",C}"#;
let mut rng = rng::get();
let new_value = transform(
&mut rng,
Expand All @@ -939,7 +1014,7 @@ mod tests {
TABLE_NAME,
);
assert!(new_value != initial_value);
let re = Regex::new(r#"^\{"[a-z]", "[a-z]"\}$"#).unwrap();
let re = Regex::new(r#"^\{"[a-z]{2} [a-z]{2} [a-z]","[a-z]"\}$"#).unwrap();
assert!(
re.is_match(&new_value),
"new value: \"{}\" does not contain same digit / alphabet structure as input",
Expand Down Expand Up @@ -984,14 +1059,33 @@ mod tests {
TABLE_NAME,
);
assert!(new_value != initial_value);
let re = Regex::new(r#"^\{[0-9], [0-9]{2}, [0-9]{3}, [0-9]{4}\}$"#).unwrap();
let re = Regex::new(r#"^\{[0-9],[0-9]{2},[0-9]{3},[0-9]{4}\}$"#).unwrap();
assert!(
re.is_match(&new_value),
"new value: \"{}\" does not contain same digit / alphabet structure as input",
new_value
);
}

// With the EmptyJson transformer, each element of a JSON array is expected to come back as a
// quoted empty object.
#[test]
fn json_array() {
    let input = r#"{"{\\"sender\\": \\"pablo\\"}","{\\"sender\\": \\"barry\\"}"}"#;
    let mut rng = rng::get();
    let array_of_json = Type::Array {
        sub_type: SubType::Json,
    };
    let empty_json = Transformer {
        name: TransformerType::EmptyJson,
        args: None,
    };

    let transformed = transform(&mut rng, input, &array_of_json, &empty_json, TABLE_NAME);

    assert_eq!(transformed, "{\"{}\",\"{}\"}");
}

#[test]
fn empty_json() {
let json = "{\"foo\": \"bar\"}";
Expand All @@ -1000,7 +1094,9 @@ mod tests {
&mut rng,
json,
&Type::SingleValue {
sub_type: SubType::Character,
sub_type: SubType::Unknown {
underlying_type: "jsonb".to_string(),
},
},
&Transformer {
name: TransformerType::EmptyJson,
Expand Down
Loading

0 comments on commit 20213f0

Please sign in to comment.