diff --git a/Cargo.lock b/Cargo.lock index 9991ee6..6a57cb5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -21,6 +21,7 @@ dependencies = [ "fake", "itertools", "lazy_static", + "log", "mimalloc", "native-tls", "postgres", diff --git a/Cargo.toml b/Cargo.toml index 98a24f5..cf1633c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ serde = { version = "1.0", features = ["derive"] } structopt = "0.3" uuid = { version = "0.8", features = [ "v4"] } mimalloc = "0.1.29" +log = "0.4.17" zstd = "0.11.2" [dev-dependencies] diff --git a/src/file_reader.rs b/src/file_reader.rs index 2c28d8b..c81dee1 100644 --- a/src/file_reader.rs +++ b/src/file_reader.rs @@ -68,6 +68,8 @@ mod tests { strategy_tuple("id"), strategy_tuple("description"), strategy_tuple("price"), + strategy_tuple("details"), + strategy_tuple("tags"), ]), ); diff --git a/src/parsers/data_row.rs b/src/parsers/data_row.rs new file mode 100644 index 0000000..7965203 --- /dev/null +++ b/src/parsers/data_row.rs @@ -0,0 +1,23 @@ +use core::str::Split; +pub fn split(line: &str) -> Split<'_, char> { + line.strip_suffix('\n').unwrap_or(line).split('\t') +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn can_split() { + let line = "1\t2\t3\t4\n"; + let result: Vec<&str> = split(line).collect(); + assert_eq!(result, vec!["1", "2", "3", "4"]); + } + + #[test] + fn can_split_with_empty_string_at_end() { + let line = "1\t2\t3\t\n"; + let result: Vec<&str> = split(line).collect(); + assert_eq!(result, vec!["1", "2", "3", ""]); + } +} diff --git a/src/parsers/mod.rs b/src/parsers/mod.rs index 0d2b2f5..6c1ae94 100644 --- a/src/parsers/mod.rs +++ b/src/parsers/mod.rs @@ -1,5 +1,6 @@ pub mod copy_row; pub mod create_row; +pub mod data_row; pub mod db_schema; pub mod national_insurance_number; pub mod rng; diff --git a/src/parsers/row_parser.rs b/src/parsers/row_parser.rs index b86094d..21a7b5e 100644 --- a/src/parsers/row_parser.rs +++ b/src/parsers/row_parser.rs @@ -1,4 +1,3 @@ -use crate::parsers::copy_row; use crate::parsers::copy_row::CurrentTableTransforms; use crate::parsers::create_row; use crate::parsers::sanitiser; @@ -7,6 +6,7 @@ use crate::parsers::strategies::Strategies; use crate::parsers::transformer; use crate::parsers::types; use crate::parsers::types::Column; +use crate::parsers::{copy_row, data_row}; use itertools::Itertools; use rand::rngs::SmallRng; use std::borrow::Cow; @@ -114,7 +114,7 @@ fn transform_row( current_table: &CurrentTableTransforms, types: &Types, ) -> String { - let column_values = split_row(line); + let column_values = data_row::split(line); let mut transformed = column_values.enumerate().map(|(i, value)| { let current_column = ¤t_table.columns[i]; @@ -152,10 +152,6 @@ fn add_create_table_row_to_types(line: &str, mut current_types: Vec) -> current_types } -fn split_row(line: &str) -> std::str::Split { - line.strip_suffix('\n').unwrap_or(line).split('\t') -} - #[cfg(test)] mod tests { use super::*; @@ -379,6 +375,33 @@ mod tests { assert_eq!(non_table_data_row, transformed_row); } + #[test] + fn table_data_with_empty_final_column() { + let table_data_row = "123\tPeter\t\n"; + let strategies = Strategies::new_from("public.users".to_string(), HashMap::new()); + + let mut state = State { + position: Position::InCopy { + current_table: CurrentTableTransforms { + table_name: "public.users".to_string(), + columns: vec![ + ColumnInfo::builder().with_name("column_1").build(), + ColumnInfo::builder().with_name("column_2").build(), + ColumnInfo::builder().with_name("column_3").build(), + ], + }, + }, + types: Types::builder() + .add_type("public.users", "column_1", SubType::Character) + .add_type("public.users", "column_2", SubType::Character) + .add_type("public.users", "column_3", SubType::Character) + .build(), + }; + let mut rng = rng::get(); + let transformed_row = parse(&mut rng, table_data_row, &mut state, &strategies); + assert_eq!("123\tPeter\t\n", transformed_row); + } + #[test] fn table_data_is_transformed() { let table_data_row = "123\tPeter\tPuckleberry\n"; diff --git a/src/parsers/sanitiser.rs b/src/parsers/sanitiser.rs index de8925e..df0b5f9 100644 --- a/src/parsers/sanitiser.rs +++ b/src/parsers/sanitiser.rs @@ -1,7 +1,33 @@ pub fn trim(line: &str) -> &str { - line.trim() + line.trim_matches(|c| c == ' ' || c == '\n') } pub fn dequote_column_or_table_name_data(line: &str) -> String { line.replace('\"', "") } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn trims_leading_whitespace_and_newline() { + let line = " 1234\n"; + let result = trim(line); + assert_eq!(result, "1234"); + } + + #[test] + fn does_not_trim_tabs() { + let line = " \t1234\t\n"; + let result = trim(line); + assert_eq!(result, "\t1234\t"); + } + + #[test] + fn dequotes_given_string() { + let line = "\"order\""; + let result = dequote_column_or_table_name_data(line); + assert_eq!(result, "order"); + } +} diff --git a/src/parsers/transformer.rs b/src/parsers/transformer.rs index 16c0f8e..089a710 100644 --- a/src/parsers/transformer.rs +++ b/src/parsers/transformer.rs @@ -12,10 +12,12 @@ use fake::faker::company::en::*; use fake::faker::internet::en::*; use fake::faker::name::en::*; use fake::Fake; +use log::trace; use rand::SeedableRng; use rand::{rngs::SmallRng, Rng}; use std::borrow::Cow; use std::collections::HashMap; +use std::fmt::Write; use std::sync::atomic::{AtomicUsize, Ordering}; use uuid::Uuid; @@ -49,6 +51,9 @@ pub fn transform<'line>( let unique = get_unique(); + //TODO error if inappropriate transformer for type is used e.g. scramble for json should give + //nice error rather than making invalid sql + match transformer.name { TransformerType::Error => { panic!("Error transform still in place for table: {}", table_name) @@ -70,12 +75,12 @@ pub fn transform<'line>( TransformerType::FakeStreetAddress => Cow::from(fake_street_address()), TransformerType::FakeState => Cow::from(StateName().fake::()), TransformerType::FakeUsername => Cow::from(fake_username(&transformer.args, unique)), - //TODO not tested VV - TransformerType::FakeUUID => Cow::from(Uuid::new_v4().to_string()), + TransformerType::Scramble => Cow::from(scramble(rng, value)), + TransformerType::ObfuscateDay => Cow::from(obfuscate_day(value, table_name)), TransformerType::Fixed => fixed(&transformer.args, table_name), TransformerType::Identity => Cow::from(value), - TransformerType::ObfuscateDay => Cow::from(obfuscate_day(value, table_name)), - TransformerType::Scramble => Cow::from(scramble(rng, value)), + //TODO not tested VV + TransformerType::FakeUUID => Cow::from(Uuid::new_v4().to_string()), } } @@ -86,34 +91,80 @@ fn transform_array<'value>( transformer: &Transformer, table_name: &str, ) -> Cow<'value, str> { - let is_string_array = underlying_type == &SubType::Character; - let unsplit_array = &value[1..value.len() - 1]; + let quoted_types = vec![SubType::Character, SubType::Json]; + let requires_quotes = quoted_types.contains(underlying_type); let sub_type = SingleValue { sub_type: underlying_type.clone(), }; - let array: Vec<_> = unsplit_array - .split(", ") - .map(|list_item| { - if is_string_array { - let list_item_without_enclosing_quotes = &list_item[1..list_item.len() - 1]; - let transformed = transform( - rng, - list_item_without_enclosing_quotes, - &sub_type, - transformer, - table_name, - ); - - Cow::from(format!("\"{}\"", transformed)) - } else { - transform(rng, list_item, &sub_type, transformer, table_name) - } - }) - .collect(); + let transformed_array = if requires_quotes { + transform_quoted_array(rng, value, &sub_type, transformer, table_name) + } else { + let unsplit_array = &value[1..value.len() - 1]; + unsplit_array + .split(", ") + .map(|list_item| transform(rng, list_item, &sub_type, transformer, table_name)) + .collect::>>() + .join(",") + }; + Cow::from(format!("{{{}}}", transformed_array)) +} + +fn transform_quoted_array<'value>( + rng: &mut SmallRng, + value: &'value str, + sub_type: &Type, + transformer: &Transformer, + table_name: &str, +) -> String { + let mut inside_word = false; + let mut word_is_quoted = false; + let mut current_word: String = "".to_string(); + let mut word_acc: String = "".to_string(); + let mut last_char_seen: char = 'a'; + let last_char_index = value.len() - 1; + for (i, c) in value.chars().enumerate() { + trace!("-----------"); + trace!("current value is '{}'", c); + if i == 0 { + continue; + } else if !inside_word && c == '"' { + word_is_quoted = true; + continue; + } else if !inside_word && c == ',' { + continue; + } else if inside_word + && ((word_is_quoted && c == '"' && last_char_seen != '\\') + || (!word_is_quoted && c == ',') + || (!word_is_quoted && c == '}')) + { + inside_word = false; + word_is_quoted = false; + let transformed = transform(rng, ¤t_word, sub_type, transformer, table_name); + write!(word_acc, "\"{}\",", &transformed) + .expect("Should be able to apppend to word_acc"); + current_word = "".to_string(); + trace!("its the end of a word"); + } else { + inside_word = true; + current_word.push(c); + } - Cow::from(format!("{{{}}}", array.join(", "))) + last_char_seen = c; + trace!( + "current_word: '{}', inside_word: '{}', last_char_seen: '{}', index: '{}/{}'", + current_word, + inside_word, + last_char_seen, + i, + last_char_index + ); + } + trace!("\noutput - {:?}", word_acc); + //Remove the trailing comma from line: 145! + word_acc.pop(); + word_acc } fn prepend_unique_if_present( @@ -275,7 +326,6 @@ fn scramble(rng: &mut SmallRng, original_value: &str) -> String { #[cfg(test)] mod tests { use super::*; - use crate::parsers::national_insurance_number; use crate::parsers::rng; use regex::Regex; @@ -924,7 +974,32 @@ mod tests { #[test] fn can_scramble_array_string_fields() { - let initial_value = "{\"A\", \"B\"}"; + let initial_value = r#"{a,b,"c or d"}"#; + let mut rng = rng::get(); + let new_value = transform( + &mut rng, + initial_value, + &Type::Array { + sub_type: SubType::Character, + }, + &Transformer { + name: TransformerType::Scramble, + args: None, + }, + TABLE_NAME, + ); + assert!(new_value != initial_value); + let re = Regex::new(r#"^\{"[a-z]","[a-z]","[a-z] [a-z]{2} [a-z]"\}$"#).unwrap(); + assert!( + re.is_match(&new_value), + "new value: \"{}\" does not contain same digit / alphabet structure as input", + new_value + ); + } + + #[test] + fn can_deal_with_commas_inside_values() { + let initial_value = r#"{"A, or B",C}"#; let mut rng = rng::get(); let new_value = transform( &mut rng, @@ -939,7 +1014,7 @@ mod tests { TABLE_NAME, ); assert!(new_value != initial_value); - let re = Regex::new(r#"^\{"[a-z]", "[a-z]"\}$"#).unwrap(); + let re = Regex::new(r#"^\{"[a-z]{2} [a-z]{2} [a-z]","[a-z]"\}$"#).unwrap(); assert!( re.is_match(&new_value), "new value: \"{}\" does not contain same digit / alphabet structure as input", @@ -984,7 +1059,7 @@ mod tests { TABLE_NAME, ); assert!(new_value != initial_value); - let re = Regex::new(r#"^\{[0-9], [0-9]{2}, [0-9]{3}, [0-9]{4}\}$"#).unwrap(); + let re = Regex::new(r#"^\{[0-9],[0-9]{2},[0-9]{3},[0-9]{4}\}$"#).unwrap(); assert!( re.is_match(&new_value), "new value: \"{}\" does not contain same digit / alphabet structure as input", @@ -992,6 +1067,25 @@ mod tests { ); } + #[test] + fn json_array() { + let json = r#"{"{\\"sender\\": \\"pablo\\"}","{\\"sender\\": \\"barry\\"}"}"#; + let mut rng = rng::get(); + let new_json = transform( + &mut rng, + json, + &Type::Array { + sub_type: SubType::Json, + }, + &Transformer { + name: TransformerType::EmptyJson, + args: None, + }, + TABLE_NAME, + ); + assert_eq!(new_json, "{\"{}\",\"{}\"}"); + } + #[test] fn empty_json() { let json = "{\"foo\": \"bar\"}"; @@ -1000,7 +1094,9 @@ mod tests { &mut rng, json, &Type::SingleValue { - sub_type: SubType::Character, + sub_type: SubType::Unknown { + underlying_type: "jsonb".to_string(), + }, }, &Transformer { name: TransformerType::EmptyJson, diff --git a/src/parsers/types.rs b/src/parsers/types.rs index dab16ef..829294c 100644 --- a/src/parsers/types.rs +++ b/src/parsers/types.rs @@ -24,7 +24,7 @@ impl Type { #[derive(Clone, Debug, PartialEq)] pub enum SubType { - //TODO JSON?!? + Json, Character, Integer, Unknown { underlying_type: String }, @@ -100,6 +100,8 @@ fn string_to_type(type_string: String) -> Type { SubType::Character } else if type_string.starts_with("bigint") || type_string.starts_with("integer") { SubType::Integer + } else if type_string.starts_with("jsonb") { + SubType::Json } else { SubType::Unknown { underlying_type: type_string.clone(), @@ -161,6 +163,14 @@ mod tests { assert_eq!(parsed.data_type, Type::array(SubType::Character)); } + #[test] + fn parses_array_of_jsonb_type() { + let row = "errors jsonb[] DEFAULT ARRAY[]::jsonb[] NOT NULL,"; + let parsed = parse(row).expect("Expected a column back! but got None"); + assert_eq!(parsed.name, "errors"); + assert_eq!(parsed.data_type, Type::array(SubType::Json)); + } + //These are written based on the BNF here: https://www.postgresql.org/docs/current/sql-createtable.html #[test] diff --git a/test_files/dump_file.sql b/test_files/dump_file.sql index 10ee9fe..8c9ca9d 100644 --- a/test_files/dump_file.sql +++ b/test_files/dump_file.sql @@ -52,7 +52,9 @@ ALTER TABLE public.orders ALTER COLUMN id ADD GENERATED ALWAYS AS IDENTITY ( CREATE TABLE public.products ( id bigint NOT NULL, description text NOT NULL, - price numeric(15,4) NOT NULL + price numeric(15,4) NOT NULL, + details jsonb[] DEFAULT ARRAY[]::jsonb[] NOT NULL, + tags character varying(255)[] DEFAULT (ARRAY[]::character varying[])::character varying(255)[] NOT NULL ); @@ -123,11 +125,11 @@ COPY public.orders (id, user_id, product_id) FROM stdin; -- Data for Name: products; Type: TABLE DATA; Schema: public; Owner: - -- -COPY public.products (id, description, price) FROM stdin; -1 a wonderful pair of trousers 24.9900 -2 a kicking pair of trainers 34.9900 -3 a warm winter coat 44.9900 -4 crocs 54.9900 +COPY public.products (id, description, price, details, tags) FROM stdin; +1 a wonderful pair of trousers 24.9900 {"{\\"sender\\": \\"pablo\\"}","{\\"sender\\": \\"barry\\"}"} {amazing,glam,"ugly as sin"} +2 a kicking pair of trainers 34.9900 {"{\\"sender\\": \\"pablo\\"}","{\\"sender\\": \\"barry\\"}"} {amazing,glam,"ugly as sin"} +3 a warm winter coat 44.9900 {"{\\"sender\\": \\"pablo\\"}","{\\"sender\\": \\"barry\\"}"} {amazing,glam,"ugly as sin"} +4 crocs 54.9900 {"{\\"sender\\": \\"pablo\\"}","{\\"sender\\": \\"barry\\"}"} {amazing,glam,"ugly as sin"} \. diff --git a/test_files/strategy.json b/test_files/strategy.json index fec9fc4..a468001 100644 --- a/test_files/strategy.json +++ b/test_files/strategy.json @@ -56,6 +56,22 @@ "transformer": { "name": "Identity" } + }, + { + "data_category": "General", + "description": "", + "name": "details", + "transformer": { + "name": "EmptyJson" + } + }, + { + "data_category": "General", + "description": "", + "name": "tags", + "transformer": { + "name": "Scramble" + } } ] },