Skip to content

Commit

Permalink
support JSON lines format (#4)
Browse files Browse the repository at this point in the history
* support JSON lines format

* cargo fmt

* docs

* rebase

* version
  • Loading branch information
hgrsd authored Apr 2, 2024
1 parent 5d894eb commit e92f23f
Show file tree
Hide file tree
Showing 6 changed files with 132 additions and 16 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name = "drivel"
description = "Infer a schema from JSON input, and generate synthetic data based on the inferred schema."
license = "MIT"
authors = ["Daniël Hogers <[email protected]>"]
version = "0.1.7"
version = "0.1.8"
edition = "2021"
repository = "https://github.com/hgrsd/drivel"

Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# drivel

`drivel` is a command-line tool written in Rust for inferring a schema from
an example JSON file, and generating synthetic data based on this inferred schema. It offers two main modes of operation: 'describe' and 'produce'.
`drivel` is a command-line tool written in Rust for inferring a schema from an example JSON (or JSON Lines) file, and generating synthetic data (the drivel in question)
based on the inferred schema.

## Features

Expand Down
101 changes: 101 additions & 0 deletions src/infer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,59 @@ pub fn infer_schema(json: &serde_json::Value) -> SchemaState {
}
}

/// Infer a schema, encoded as a `SchemaState`, from a sequence of JSON values.
///
/// Every value is run through `infer_schema`, and the resulting per-value schemas are
/// folded together with `merge`, starting from `SchemaState::Initial`. The returned
/// schema therefore reflects the combined shape of all values in the input; an empty
/// input yields the fold's seed, `SchemaState::Initial`.
///
/// Anything implementing `IntoIterator<Item = serde_json::Value>` is accepted, so both
/// iterators and owned collections (such as a `Vec`) can be passed.
///
/// # Example
///
/// ```
/// use serde_json::json;
/// use std::collections::{HashMap, HashSet};
/// use drivel::{infer_schema_from_iter, SchemaState, StringType, NumberType};
///
/// // Define a collection of JSON values
/// let values = vec![
///     json!({
///         "name": "Alice",
///         "age": 30,
///         "is_student": true
///     }),
///     json!({
///         "name": "Bob",
///         "age": 25,
///         "is_student": false
///     })
/// ];
///
/// // Infer the schema from the iterator of JSON values
/// let schema = infer_schema_from_iter(values.into_iter());
///
/// assert_eq!(
///     schema,
///     SchemaState::Object {
///         required: HashMap::from_iter([
///             ("name".to_string(), SchemaState::String(StringType::Unknown {
///                 chars_seen: vec!['A', 'l', 'i', 'c', 'e', 'B', 'o', 'b'],
///                 strings_seen: HashSet::from_iter(["Alice".to_string(), "Bob".to_string()]),
///                 min_length: Some(3),
///                 max_length: Some(5)
///             })),
///             ("age".to_string(), SchemaState::Number(NumberType::Integer { min: 25, max: 30 })),
///             ("is_student".to_string(), SchemaState::Boolean),
///         ]),
///         optional: HashMap::new()
///     }
/// );
/// ```
pub fn infer_schema_from_iter(values: impl IntoIterator<Item = serde_json::Value>) -> SchemaState {
    values
        .into_iter()
        .map(|value| infer_schema(&value))
        // `SchemaState::Initial` is the seed, so it is also the result for empty input.
        .fold(SchemaState::Initial, merge)
}

#[cfg(test)]
mod tests {
use serde_json::json;
Expand Down Expand Up @@ -771,4 +824,52 @@ mod tests {

assert_eq!(schema_1, schema_2)
}

#[test]
fn infers_from_iter() {
    use std::collections::{HashMap, HashSet};

    // Three records: the second one omits `foo` and carries a null `baz`.
    let documents = vec![
        json!({ "foo": "bar", "baz": 10, "qux": true }),
        json!({ "baz": null, "qux": false }),
        json!({ "foo": "barbar", "baz": 20, "qux": true }),
    ];

    // `foo` is absent from one record, so it is expected among the optional fields.
    let expected_foo = SchemaState::String(StringType::Unknown {
        chars_seen: "barbarbar".chars().collect(),
        strings_seen: ["bar", "barbar"]
            .into_iter()
            .map(String::from)
            .collect::<HashSet<_>>(),
        min_length: Some(3),
        max_length: Some(6),
    });

    // `baz` is present in every record but null once, hence the nullable integer range.
    let expected_baz = SchemaState::Nullable(Box::new(SchemaState::Number(
        NumberType::Integer { min: 10, max: 20 },
    )));

    let expected = SchemaState::Object {
        required: HashMap::from([
            ("baz".to_owned(), expected_baz),
            ("qux".to_owned(), SchemaState::Boolean),
        ]),
        optional: HashMap::from([("foo".to_owned(), expected_foo)]),
    };

    let inferred = infer_schema_from_iter(documents.into_iter());
    assert_eq!(inferred, expected);
}
}
2 changes: 1 addition & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,6 @@ mod infer;
mod produce;
mod schema;

pub use infer::infer_schema;
pub use infer::*;
pub use produce::produce;
pub use schema::*;
37 changes: 26 additions & 11 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -1,30 +1,45 @@
use drivel::SchemaState;

fn main() {
let args: Vec<_> = std::env::args().collect();
let mode = args.get(1).expect(
"No mode provided. Usage: drivel [mode] <n repeat>, where mode is in (describe, produce)",
);
let repeat_n: usize = args.get(2).and_then(|size| size.parse().ok()).unwrap_or(1);
let stdin = std::io::stdin();
let input = std::io::read_to_string(std::io::stdin()).expect("Unable to read from stdin");

let parsed: serde_json::Value =
serde_json::from_reader(stdin).expect("Unable to parse input JSON");
let schema = if let Ok(json) = serde_json::from_str(&input) {
drivel::infer_schema(&json)
} else {
// unable to parse input as JSON; try JSON lines format as fallback
let values = input.lines().map(|line| {
serde_json::from_str(line)
.expect("Unable to parse input; format is neither JSON nor JSON lines")
});
drivel::infer_schema_from_iter(values)
};

match mode.as_str() {
"produce" => {
let parsed = if !parsed.is_array() && repeat_n > 1 {
// if the user wants to repeat the data more than once and we aren't dealing with an array at the root,
// then we wrap the root value in an array first so that downstream we can just expand that array.
serde_json::Value::Array(vec![parsed])
} else {
parsed
let schema = match schema {
SchemaState::Array { .. } => schema,
_ => {
// if the user wants to repeat the data more than once and we aren't dealing
// with an array at the root, then we wrap the state in an array before we
// produce our values
if repeat_n > 1 {
SchemaState::Array { min_length: 1, max_length: 1, schema: Box::new(schema) }
} else {
schema
}
}
};
let schema = drivel::infer_schema(&parsed);

let result = drivel::produce(&schema, repeat_n);
let stdout = std::io::stdout();
serde_json::to_writer_pretty(stdout, &result).unwrap();
},
"describe" => {
let schema = drivel::infer_schema(&parsed);
println!("{}", schema.to_string_pretty());
}
_ => println!("Invalid mode provided. Usage: drivel [mode] <array_length>, where mode is in (describe, produce)")
Expand Down

0 comments on commit e92f23f

Please sign in to comment.