From 3a575b40ba597a09d183794952b846c453a755ec Mon Sep 17 00:00:00 2001 From: Leonid Ryzhyk Date: Mon, 2 Mar 2020 10:04:59 -0800 Subject: [PATCH] Deserialize records on the fly. Consider a scenario where input relations contain JSON strings. These strings are parsed using the recently added JSON lib and then DDlog goes on to compute on the parsed data. When working with large JSON documents where only a handful of attributes are actually used by DDlog, DDlog's memory footprint is dominated by unparsed JSON inputs. To avoid this waste, we introduce a new record type that contains serialized data represented as a string. DDlog parses this string using the appropriate deserializer (currently, only json is supported) and drops the string on the floor, keeping only the extracted data in memory. Syntactically, the new type of record is written down as `@<format><string>`, where `<format>` is the name of the serialization format and `<string>` can be either a quoted string literal or a reference to a file, e.g.: ``` insert json_test.Deserialized[@json"{\"@type\": \"t.V1\", \"b\": true}"] ``` --- lib/json.toml | 2 - rust/template/cmd_parser/parse.rs | 40 +++++++++++++++++--- rust/template/differential_datalog/record.rs | 38 ++++--------------- rust/template/types/Cargo.toml | 1 + src/Language/DifferentialDatalog/Compile.hs | 11 +++++- test/datalog_tests/json_test.dat | 6 +++ test/datalog_tests/json_test.dl | 4 ++ test/datalog_tests/json_test.dump.expected | 1 + 8 files changed, 64 insertions(+), 39 deletions(-) delete mode 100644 lib/json.toml diff --git a/lib/json.toml b/lib/json.toml deleted file mode 100644 index 594677e20..000000000 --- a/lib/json.toml +++ /dev/null @@ -1,2 +0,0 @@ -[dependencies.serde_json] -version = "1.0" diff --git a/rust/template/cmd_parser/parse.rs b/rust/template/cmd_parser/parse.rs index 186ee2bd2..1a302bdb3 100644 --- a/rust/template/cmd_parser/parse.rs +++ b/rust/template/cmd_parser/parse.rs @@ -314,7 +314,7 @@ named!(rel_key<&[u8], (Name, Record)>, ); named!(record<&[u8], Record>, - alt!(bool_val | string_val | 
string_val_from_file | tuple_val | array_val | struct_val | int_val ) + alt!(bool_val | string_val | serialized_val | tuple_val | array_val | struct_val | int_val ) ); named!(named_record<&[u8], (Name, Record)>, @@ -368,24 +368,32 @@ named!(string_literal<&[u8], String>, ) ); -named!(string_val_from_file<&[u8], Record>, +named!(string_from_file<&[u8], String>, do_parse!( tag!("%") >> fname: string_literal >> - (Record::String(std::fs::read_to_string(std::path::Path::new(&fname)).map_err(|e|format!("Failed to read string from file {}: {}", fname, e)).unwrap())) + (std::fs::read_to_string(std::path::Path::new(&fname)).map_err(|e|format!("Failed to read string from file {}: {}", fname, e)).unwrap()) ) ); -named!(string_val<&[u8], Record>, +named!(string_inline<&[u8], String>, do_parse!( str: string_literal >> - (Record::String(str)) + (str) ) ); +named!(string_token<&[u8], String>, + alt!(string_inline | string_from_file) +); + +named!(string_val<&[u8], Record>, + map!(string_token, Record::String) +); + #[test] fn test_string() { assert_eq!( @@ -406,6 +414,28 @@ fn test_string() { ); } +named!(serialized_val<&[u8], Record>, + do_parse!( + tag!("@") + >> + format_name: identifier + >> + data: string_token + >> + (Record::Serialized(Cow::from(format_name), data)) + ) +); + +#[test] +fn test_serialized() { + assert_eq!( + serialized_val(br###"@json"foo""###), + Ok(( + &br""[..], + Record::Serialized(Cow::from("json"), "foo".to_string()) + )) + ); +} named!(tuple_val<&[u8], Record>, delimited!(apply!(sym,"("), map!(separated_list!(apply!(sym,","), record), Record::Tuple), diff --git a/rust/template/differential_datalog/record.rs b/rust/template/differential_datalog/record.rs index 5a09bb147..61bc0478e 100644 --- a/rust/template/differential_datalog/record.rs +++ b/rust/template/differential_datalog/record.rs @@ -51,6 +51,9 @@ pub enum Record { Bool(bool), Int(BigInt), String(String), + /// Value serialized in a string. 
The first field stores the name of the + /// serialization format, e.g., "json". + Serialized(Name, String), Tuple(Vec), Array(CollectionKind, Vec), PosStruct(Name, Vec), @@ -64,6 +67,10 @@ impl fmt::Display for Record { Record::Bool(false) => write!(f, "false"), Record::Int(i) => i.fmt(f), Record::String(s) => format_ddlog_str(s.as_ref(), f), + Record::Serialized(n, s) => { + write!(f, "#{}", n)?; + format_ddlog_str(s.as_ref(), f) + } Record::Tuple(recs) => { write!(f, "(")?; let len = recs.len(); @@ -1454,7 +1461,7 @@ macro_rules! decl_record_mutator_struct { #[macro_export] macro_rules! decl_record_mutator_enum { ( $n:ident, <$( $targ:ident),*>, $($cons:ident {$( $arg:ident : $type:ty),*}),* ) => { - impl<$($targ: $crate::record::FromRecord+Default),*> $crate::record::Mutator<$n<$($targ),*>> for $crate::record::Record + impl<$($targ: $crate::record::FromRecord+serde::de::DeserializeOwned+Default),*> $crate::record::Mutator<$n<$($targ),*>> for $crate::record::Record where $($crate::record::Record: $crate::record::Mutator<$targ>),* { fn mutate(&self, x: &mut $n<$($targ),*>) -> Result<(), String> { @@ -1533,35 +1540,6 @@ macro_rules! decl_val_enum_into_record { }; } -#[macro_export] -macro_rules! decl_record_mutator_val_enum { - ( $n:ident, <$( $targ:ident),*>, $($cons:ident ($type:ty)),* ) => { - impl<$($targ: $crate::record::FromRecord+Default),*> $crate::record::Mutator<$n<$($targ),*>> for $crate::record::Record - where $($crate::record::Record: $crate::record::Mutator<$targ>),* - { - fn mutate(&self, x: &mut $n<$($targ),*>) -> Result<(), String> { - match self { - $crate::record::Record::PosStruct(..) 
=> { - Err(format!("Cannot use positional struct as mutator")) - }, - $crate::record::Record::NamedStruct(_, args) => { - match x { - $( - $n::$cons(v) => { - >::mutate(self, v) - } - ),* - } - }, - _ => { - Result::Err(format!("not a struct {:?}", *self)) - } - } - } - } - }; -} - #[cfg(test)] mod tests { use super::*; diff --git a/rust/template/types/Cargo.toml b/rust/template/types/Cargo.toml index 49eeebb9e..0bf9018b4 100644 --- a/rust/template/types/Cargo.toml +++ b/rust/template/types/Cargo.toml @@ -18,6 +18,7 @@ lazy_static = "1.3" libc = "0.2" num-traits = "0.2" serde = {version = "1.0", features = ["derive"]} +serde_json = "1.0" timely = "0.11" twox-hash = "1.1" diff --git a/src/Language/DifferentialDatalog/Compile.hs b/src/Language/DifferentialDatalog/Compile.hs index 85270da44..86ceb6cfe 100644 --- a/src/Language/DifferentialDatalog/Compile.hs +++ b/src/Language/DifferentialDatalog/Compile.hs @@ -713,15 +713,22 @@ mkFromRecord t@TypeDef{..} = " c => result::Result::Err(format!(\"unknown constructor {} of type" <+> rname (name t) <+> "in {:?}\", c, *val))" $$ " }" $$ " }," $$ + " record::Record::Serialized(format, s) => {" $$ + " if format == \"json\" {" $$ + " serde_json::from_str(&*s).map_err(|e|format!(\"{}\", e))" $$ + " } else {" $$ + " result::Result::Err(format!(\"unsupported serialization format '{}'\", format))" $$ + " }" $$ + " }," $$ " v => {" $$ - " result::Result::Err(format!(\"not a struct {:?}\", *v))" $$ + " result::Result::Err(format!(\"not a struct {:?}\", *v))" $$ " }" $$ " }" $$ " }" $$ "}" where targs = "<" <> (hcat $ punctuate comma $ map pp tdefArgs) <> ">" - targs_bounds = "<" <> (hcat $ punctuate comma $ map ((<> ": record::FromRecord + Default") . pp) tdefArgs) <> ">" + targs_bounds = "<" <> (hcat $ punctuate comma $ map ((<> ": record::FromRecord + serde::de::DeserializeOwned + Default") . 
pp) tdefArgs) <> ">" pos_constructors = vcat $ map mkposcons $ typeCons $ fromJust tdefType mkposcons :: Constructor -> Doc mkposcons c@Constructor{..} = diff --git a/test/datalog_tests/json_test.dat b/test/datalog_tests/json_test.dat index cab9283df..2314c92ed 100644 --- a/test/datalog_tests/json_test.dat +++ b/test/datalog_tests/json_test.dat @@ -1 +1,7 @@ dump json_test.JsonTest; + +start; +insert json_test.Deserialized[@json"{\"@type\": \"t.V1\", \"b\": true}"], +commit; + +dump json_test.ODeserialized; diff --git a/test/datalog_tests/json_test.dl b/test/datalog_tests/json_test.dl index 82f178118..a359cfd43 100644 --- a/test/datalog_tests/json_test.dl +++ b/test/datalog_tests/json_test.dl @@ -108,3 +108,7 @@ JsonTest(scalar3(), to_json_string_or_default(from_json_string(scalar3()): Result)). JsonTest(scalar4(), to_json_string_or_default(from_json_string(scalar4()): Result)). + +input relation Deserialized[TaggedEnum] +output relation ODeserialized[TaggedEnum] +ODeserialized[x] :- Deserialized[x]. diff --git a/test/datalog_tests/json_test.dump.expected b/test/datalog_tests/json_test.dump.expected index f7b91aa08..5a660bb17 100644 --- a/test/datalog_tests/json_test.dump.expected +++ b/test/datalog_tests/json_test.dump.expected @@ -14,3 +14,4 @@ json_test.JsonTest{.description = "{\"b\":true}", .value = "{\"std_Ok\":{\"res\" json_test.JsonTest{.description = "{\"foo\":\"bar\"}", .value = "{\"std_Err\":{\"err\":\"missing field `b` at line 1 column 13\"}}"} json_test.JsonTest{.description = "{\"t\":\"foo\", \"@id\":\"1001001001\", \"x\": \"x\", \"z\": 100000}", .value = "{\"std_Ok\":{\"res\":{\"t\":\"foo\",\"@id\":\"1001001001\",\"x\":\"x\",\"z\":100000}}}"} json_test.JsonTest{.description = "{\"t\":\"foo\", \"id\":\"1001001001\", \"nested\": {\"x\": \"x\", \"z\": 100000}}", .value = "{\"std_Ok\":{\"res\":{\"t\":\"foo\",\"id\":\"1001001001\",\"nested\":{\"x\":\"x\",\"z\":100000}}}}"} +json_test.TVariant1{.b = true}