diff --git a/Cargo.lock b/Cargo.lock
index 6026fed..bc19874 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -177,7 +177,7 @@ dependencies = [
 
 [[package]]
 name = "drivel"
-version = "0.2.1"
+version = "0.2.2"
 dependencies = [
  "chrono",
  "clap",
diff --git a/Cargo.toml b/Cargo.toml
index 9ebc9bc..cf5460a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ name = "drivel"
 description = "Infer a schema from JSON input, and generate synthetic data based on the inferred schema."
 license = "MIT"
 authors = ["Daniƫl Hogers "]
-version = "0.2.1"
+version = "0.2.2"
 edition = "2021"
 repository = "https://github.com/hgrsd/drivel"
 
diff --git a/src/infer_string.rs b/src/infer_string.rs
index 65e26a0..f7d4092 100644
--- a/src/infer_string.rs
+++ b/src/infer_string.rs
@@ -11,27 +11,77 @@ lazy_static! {
         regex::Regex::new(r"[a-zA-Z0-9]+@[a-zA-Z0-9]+\.[a-zA-Z]{2,}$").unwrap();
 }
 
-pub(crate) fn infer_string_type(s: &str) -> StringType {
-    if ISO_DATE_REGEX.is_match(s) {
-        StringType::IsoDate
-    } else if chrono::DateTime::parse_from_rfc2822(s).is_ok() {
-        StringType::DateTimeISO8601
-    } else if chrono::DateTime::parse_from_rfc3339(s).is_ok() {
-        StringType::DateTimeISO8601
-    } else if UUIDREGEX.is_match(s) {
-        StringType::UUID
-    } else if EMAIL_REGEX.is_match(s) {
-        StringType::Email
-    } else if url::Url::parse(s).is_ok() {
-        StringType::Url
-    } else if HOSTNAME_REGEX.is_match(s) {
-        StringType::Hostname
-    } else {
-        StringType::Unknown {
-            strings_seen: vec![s.to_owned()],
-            chars_seen: s.chars().collect(),
-            min_length: Some(s.len()),
-            max_length: Some(s.len()),
-        }
-    }
+/// Returns `Some(StringType::UUID)` if `s` is 36 bytes long and matches `UUIDREGEX`.
+///
+/// The length check is a cheap pre-filter that avoids running the regex on most inputs.
+fn uuid(s: &str) -> Option<StringType> {
+    if s.len() == 36 && UUIDREGEX.is_match(s) {
+        Some(StringType::UUID)
+    } else {
+        None
+    }
+}
+
+/// Returns `Some(StringType::Email)` if `s` contains `@` and matches `EMAIL_REGEX`.
+fn email(s: &str) -> Option<StringType> {
+    if s.contains('@') && EMAIL_REGEX.is_match(s) {
+        Some(StringType::Email)
+    } else {
+        None
+    }
+}
+
+/// Returns `Some(StringType::Url)` if `s` parses as a URL, otherwise
+/// `Some(StringType::Hostname)` if `s` contains `.` and matches `HOSTNAME_REGEX`.
+fn url_host(s: &str) -> Option<StringType> {
+    // `Url::parse` only accepts absolute URLs, and those always contain a `:` after the
+    // scheme, so this pre-filter never rejects a parseable URL. Gating on `.` instead would
+    // stop dot-less URLs such as `http://localhost:8080/` from being recognised.
+    if s.contains(':') && url::Url::parse(s).is_ok() {
+        return Some(StringType::Url);
+    }
+    if s.contains('.') && HOSTNAME_REGEX.is_match(s) {
+        return Some(StringType::Hostname);
+    }
+    None
+}
+
+/// Returns `Some(StringType::IsoDate)` if `s` matches `ISO_DATE_REGEX`, or
+/// `Some(StringType::DateTimeISO8601)` if `s` parses as RFC 3339 or RFC 2822.
+fn dates(s: &str) -> Option<StringType> {
+    // The ISO-date regex and the RFC 3339 parser are only tried when `s` starts with a
+    // numeric character; RFC 2822 is always tried.
+    if s.starts_with(char::is_numeric) {
+        if ISO_DATE_REGEX.is_match(s) {
+            return Some(StringType::IsoDate);
+        }
+        if chrono::DateTime::parse_from_rfc3339(s).is_ok() {
+            return Some(StringType::DateTimeISO8601);
+        }
+    }
+
+    if chrono::DateTime::parse_from_rfc2822(s).is_ok() {
+        return Some(StringType::DateTimeISO8601);
+    }
+
+    None
+}
+
+/// Infers the `StringType` of `s`.
+///
+/// Tries the UUID, e-mail, URL/hostname and date matchers in that order and returns the first
+/// match; if none matches, returns `StringType::Unknown` seeded with `s` itself.
+pub(crate) fn infer_string_type(s: &str) -> StringType {
+    for matcher in [uuid, email, url_host, dates] {
+        if let Some(string_type) = matcher(s) {
+            return string_type;
+        }
+    }
+
+    StringType::Unknown {
+        strings_seen: vec![s.to_owned()],
+        chars_seen: s.chars().collect(),
+        min_length: Some(s.len()),
+        max_length: Some(s.len()),
+    }
 }