From d3ace6743063cd6ddf38fe07084a500fbf8d2b69 Mon Sep 17 00:00:00 2001 From: hgrsd Date: Fri, 29 Mar 2024 20:56:49 +0000 Subject: [PATCH] smarter strings --- Cargo.lock | 40 +++++++++++++++++++++++ Cargo.toml | 2 ++ src/lib.rs | 95 ++++++++++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 123 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b34ca4a..b75a679 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,10 +2,21 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + [[package]] name = "drivel" version = "0.1.0" dependencies = [ + "lazy_static", + "regex", "serde", "serde_json", ] @@ -16,6 +27,18 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "memchr" +version = "2.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" + [[package]] name = "proc-macro2" version = "1.0.79" @@ -34,6 +57,23 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "regex" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af83e617f331cc6ae2da5443c602dfa5af81e517212d9d611a5b3ba1777b5370" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" + [[package]] name = "ryu" version = "1.0.17" diff --git a/Cargo.toml b/Cargo.toml index 3b3955a..ebcd5ce 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,5 +6,7 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +lazy_static = "1.4.0" +regex = "1.8.1" serde_json = "1.0.115" serde = { version = "1.0.197", features = ["derive"] } diff --git a/src/lib.rs b/src/lib.rs index e0d5f7a..4817f82 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,11 +1,31 @@ -use serde::Serialize; +#[macro_use] +extern crate lazy_static; + +lazy_static! { + static ref IsoDateRegex: regex::Regex = regex::Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap(); + static ref IsoDateTimeRegex: regex::Regex = regex::Regex::new( + r"^\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d:[0-5]\d\.\d+([+-][0-2]\d:[0-5]\d|Z)$" + ) + .unwrap(); + static ref UUIDRegex: regex::Regex = + regex::Regex::new(r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$") + .unwrap(); +} + +#[derive(PartialEq, Eq, Debug)] +pub enum StringType { + Unknown, + IsoDate, + IsoDateTime, + UUID, +} -#[derive(PartialEq, Eq, Debug, Serialize)] +#[derive(PartialEq, Eq, Debug)] pub enum SchemaState { Initial, Null, Nullable(Box), - String, + String(StringType), Number { float: bool, }, @@ -21,7 +41,7 @@ pub enum SchemaState { fn merge(initial: SchemaState, new: SchemaState) -> SchemaState { match (initial, new) { (SchemaState::Initial, SchemaState::Null) => SchemaState::Null, - (SchemaState::Initial, SchemaState::String) => SchemaState::String, + (SchemaState::Initial, SchemaState::String(x)) => SchemaState::String(x), (SchemaState::Initial, SchemaState::Boolean) => SchemaState::Boolean, (SchemaState::Initial, SchemaState::Number { float }) => SchemaState::Number { float }, (SchemaState::Initial, SchemaState::Array(inner)) => SchemaState::Array(inner), @@ -29,7 +49,13 @@ fn merge(initial: SchemaState, new: SchemaState) -> SchemaState { SchemaState::Object { required, optional } } - (SchemaState::String, SchemaState::String) => SchemaState::String, + (SchemaState::String(first_type), SchemaState::String(second_type)) => { + SchemaState::String(if first_type == second_type { + first_type + } else { + StringType::Unknown + }) + } (SchemaState::Number { float: true }, SchemaState::Number { float: _ }) => { SchemaState::Number { float: true } @@ -128,7 +154,18 @@ fn infer_array_schema(values: &Vec) -> SchemaState { pub fn infer_schema(json: &serde_json::Value) -> SchemaState { match json { serde_json::Value::Null => SchemaState::Null, - serde_json::Value::String(_) => SchemaState::String, + serde_json::Value::String(value) => { + let t = if IsoDateRegex.is_match(value) { + StringType::IsoDate + } else if IsoDateTimeRegex.is_match(value) { + StringType::IsoDateTime + } else if UUIDRegex.is_match(value) { + StringType::UUID + } else { + StringType::Unknown + }; + SchemaState::String(t) + } serde_json::Value::Number(n) => SchemaState::Number { float: n.is_f64() }, serde_json::Value::Bool(_) => SchemaState::Boolean, serde_json::Value::Array(array) => SchemaState::Array(Box::new(infer_array_schema(array))), @@ -157,11 +194,35 @@ mod tests { } #[test] - fn infers_string() { + fn infers_string_unknown_type() { let input = json!("foo"); let schema = infer_schema(&input); - assert_eq!(schema, SchemaState::String) + assert_eq!(schema, SchemaState::String(StringType::Unknown)) + } + + #[test] + fn infers_string_iso_date() { + let input = json!("2013-01-12"); + let schema = infer_schema(&input); + + assert_eq!(schema, SchemaState::String(StringType::IsoDate)) + } + + #[test] + fn infers_string_iso_date_time() { + let input = json!("2013-01-12T00:00:00.000Z"); + let schema = infer_schema(&input); + + assert_eq!(schema, SchemaState::String(StringType::IsoDateTime)) + } + + #[test] + fn infers_string_uuid() { + let input = json!("988c2c6d-df1b-4bb9-b837-6ba706c0b4ad"); + let schema = infer_schema(&input); + + assert_eq!(schema, SchemaState::String(StringType::UUID)) } #[test] @@ -215,13 +276,16 @@ mod tests { schema, SchemaState::Object { required: std::collections::HashMap::from_iter([ - ("string".to_string(), SchemaState::String), + ( + "string".to_string(), + SchemaState::String(StringType::Unknown) + ), ("int".to_string(), SchemaState::Number { float: false }), ("float".to_string(), SchemaState::Number { float: true }), ("bool".to_string(), SchemaState::Boolean), ( "array".to_string(), - SchemaState::Array(Box::new(SchemaState::String)) + SchemaState::Array(Box::new(SchemaState::String(StringType::Unknown))) ), ("null".to_string(), SchemaState::Null), ( @@ -229,7 +293,7 @@ mod tests { SchemaState::Object { required: std::collections::HashMap::from_iter([( "string".to_owned(), - SchemaState::String + SchemaState::String(StringType::Unknown) )]), optional: std::collections::HashMap::new(), } @@ -253,7 +317,10 @@ mod tests { let input = json!(["foo", "bar"]); let schema = infer_schema(&input); - assert_eq!(schema, SchemaState::Array(Box::new(SchemaState::String))); + assert_eq!( + schema, + SchemaState::Array(Box::new(SchemaState::String(StringType::Unknown))) + ); } #[test] @@ -313,7 +380,7 @@ mod tests { ]), optional: std::collections::HashMap::from_iter([( "foo".to_owned(), - SchemaState::String + SchemaState::String(StringType::Unknown) )]) })) ) @@ -338,7 +405,7 @@ mod tests { assert_eq!( schema, SchemaState::Array(Box::new(SchemaState::Nullable(Box::new( - SchemaState::String + SchemaState::String(StringType::Unknown) )))) ); }