From fa298e93c23ea7658e1b8dbf56d2145c6b65976a Mon Sep 17 00:00:00 2001 From: Griffin Berlstein Date: Thu, 14 Nov 2024 15:11:18 +0000 Subject: [PATCH] [Data converter] Improve the hex parsing & general functionality (#2352) I spent some more time hacking on this instead of spending my time in a more productive way. This does the following: - switch the `dat` deserializing from custom matching stuff to a proper `nom` parser that accounts for comments and leading `0x` tags. - `dat` files can now parse values with leading zeroes truncated (though we continue to generate `dat` files with the leading zeroes included) - Output `dat` files will be generated by default as `MEMNAME.dat` though this can be customized with `-e` flag. I.e. `-e out` will generate `MEMNAME.out` - Similarly, when reading in a `dat` directory, the tool will look for `MEMNAME.dat` which can be retargeted via `-e` flag. - The tool will also infer the `--to json` target when given a directory as input --- Cargo.lock | 19 ++- tools/cider-data-converter/Cargo.toml | 1 + tools/cider-data-converter/src/dat_parser.rs | 133 +++++++++++++++++++ tools/cider-data-converter/src/lib.rs | 1 + tools/cider-data-converter/src/main.rs | 90 ++++++++----- 5 files changed, 213 insertions(+), 31 deletions(-) create mode 100644 tools/cider-data-converter/src/dat_parser.rs diff --git a/Cargo.lock b/Cargo.lock index 7b5156844a..7eba776379 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -544,6 +544,7 @@ dependencies = [ "argh", "interp", "itertools 0.11.0", + "nom 7.1.3", "num-bigint", "num-rational", "num-traits", @@ -1623,6 +1624,12 @@ version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "miniz_oxide" version = "0.7.2" @@ -1674,6 +1681,16 @@ dependencies = [ "version_check 0.1.5", ] +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + [[package]] name = "nu-ansi-term" version = "0.46.0" @@ -2409,7 +2426,7 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5318bfeed779c64075ce317c81462ed54dc00021be1c6b34957d798e11a68bdb" dependencies = [ - "nom", + "nom 4.2.3", "serde", ] diff --git a/tools/cider-data-converter/Cargo.toml b/tools/cider-data-converter/Cargo.toml index a5755b4066..a690023d34 100644 --- a/tools/cider-data-converter/Cargo.toml +++ b/tools/cider-data-converter/Cargo.toml @@ -17,6 +17,7 @@ thiserror = "1.0.59" num-bigint = { version = "0.4.6" } num-rational = { version = "0.4.2" } num-traits = { version = "0.2.19" } +nom = "7.1.3" [dev-dependencies] proptest = "1.0.0" diff --git a/tools/cider-data-converter/src/dat_parser.rs b/tools/cider-data-converter/src/dat_parser.rs new file mode 100644 index 0000000000..88f99d6ba6 --- /dev/null +++ b/tools/cider-data-converter/src/dat_parser.rs @@ -0,0 +1,133 @@ +use nom::{ + branch::alt, + bytes::complete::{tag, take_while_m_n}, + character::complete::{anychar, line_ending, multispace0}, + combinator::{eof, map_res, opt}, + error::Error, + multi::{many1, many_till}, + sequence::{preceded, tuple}, + IResult, +}; + +fn is_hex_digit(c: char) -> bool { + c.is_ascii_hexdigit() +} + +fn from_hex(input: &str) -> Result { + u8::from_str_radix(input, 16) +} + +fn parse_hex(input: &str) -> IResult<&str, u8> { + map_res(take_while_m_n(1, 2, is_hex_digit), from_hex)(input) +} + +/// Parse a single line of hex characters into a vector of bytes in the order +/// the characters are given, i.e. reversed. +fn hex_line(input: &str) -> IResult<&str, LineOrComment> { + // strip any leading whitespace + let (input, bytes) = preceded( + tuple((multispace0, opt(tag("0x")))), + many1(parse_hex), + )(input)?; + + Ok((input, LineOrComment::Line(bytes))) +} + +fn comment(input: &str) -> IResult<&str, LineOrComment> { + // skip any whitespace + let (input, _) = multispace0(input)?; + let (input, _) = tag("//")(input)?; + let (input, _) = many_till(anychar, alt((line_ending, eof)))(input)?; + Ok((input, LineOrComment::Comment)) +} +/// Parse a line which only contains whitespace +fn empty_line(input: &str) -> IResult<&str, LineOrComment> { + // skip any whitespace + let (input, _) = multispace0(input)?; + Ok((input, LineOrComment::EmptyLine)) +} + +pub fn line_or_comment( + input: &str, +) -> Result>> { + let (_, res) = alt((hex_line, comment, empty_line))(input)?; + Ok(res) +} + +#[derive(Debug, PartialEq)] +pub enum LineOrComment { + Line(Vec), + Comment, + EmptyLine, +} + +/// Parse a single line of hex characters, or a comment. Returns None if it's a +/// comment or an empty line and Some(Vec) if it's a hex line. Panics on a +/// parse error. +/// +/// For the fallible version, see `line_or_comment`. +pub fn unwrap_line_or_comment(input: &str) -> Option> { + match line_or_comment(input).expect("hex parse failed") { + LineOrComment::Line(vec) => Some(vec), + LineOrComment::Comment => None, + LineOrComment::EmptyLine => None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_comment() { + assert_eq!(comment("// comment"), Ok(("", LineOrComment::Comment))); + assert_eq!(comment("// comment\n"), Ok(("", LineOrComment::Comment))); + } + + #[test] + fn test_hex_line() { + assert_eq!(hex_line("0x01"), Ok(("", LineOrComment::Line(vec![1])))); + assert_eq!(hex_line("0x02"), Ok(("", LineOrComment::Line(vec![2])))); + assert_eq!(hex_line("0x03"), Ok(("", LineOrComment::Line(vec![3])))); + assert_eq!(hex_line("0x04"), Ok(("", LineOrComment::Line(vec![4])))); + assert_eq!(hex_line("0x05"), Ok(("", LineOrComment::Line(vec![5])))); + assert_eq!(hex_line("0x06"), Ok(("", LineOrComment::Line(vec![6])))); + assert_eq!(hex_line("0x07"), Ok(("", LineOrComment::Line(vec![7])))); + assert_eq!(hex_line("0x08"), Ok(("", LineOrComment::Line(vec![8])))); + assert_eq!(hex_line("0x09"), Ok(("", LineOrComment::Line(vec![9])))); + assert_eq!(hex_line("0x0a"), Ok(("", LineOrComment::Line(vec![10])))); + assert_eq!(hex_line("0x0b"), Ok(("", LineOrComment::Line(vec![11])))); + assert_eq!(hex_line("0x0c"), Ok(("", LineOrComment::Line(vec![12])))); + assert_eq!(hex_line("0x0d"), Ok(("", LineOrComment::Line(vec![13])))); + assert_eq!(hex_line("0x0e"), Ok(("", LineOrComment::Line(vec![14])))); + assert_eq!(hex_line("0x0f"), Ok(("", LineOrComment::Line(vec![15])))); + assert_eq!(hex_line("0xff"), Ok(("", LineOrComment::Line(vec![255])))); + assert_eq!( + hex_line("0x00ff"), + Ok(("", LineOrComment::Line(vec![0, 255]))) + ); + } + + #[test] + fn test_from_hex() { + assert_eq!(from_hex("0"), Ok(0)); + assert_eq!(from_hex("1"), Ok(1)); + assert_eq!(from_hex("2"), Ok(2)); + assert_eq!(from_hex("3"), Ok(3)); + assert_eq!(from_hex("4"), Ok(4)); + assert_eq!(from_hex("5"), Ok(5)); + assert_eq!(from_hex("6"), Ok(6)); + assert_eq!(from_hex("7"), Ok(7)); + assert_eq!(from_hex("8"), Ok(8)); + assert_eq!(from_hex("9"), Ok(9)); + assert_eq!(from_hex("a"), Ok(10)); + assert_eq!(from_hex("b"), Ok(11)); + assert_eq!(from_hex("c"), Ok(12)); + assert_eq!(from_hex("d"), Ok(13)); + assert_eq!(from_hex("e"), Ok(14)); + assert_eq!(from_hex("f"), Ok(15)); + + assert_eq!(from_hex("FF"), Ok(255)); + assert_eq!(from_hex("ff"), Ok(255)); + } +} diff --git a/tools/cider-data-converter/src/lib.rs b/tools/cider-data-converter/src/lib.rs index 7e1dadc1bd..31092d73e6 100644 --- a/tools/cider-data-converter/src/lib.rs +++ b/tools/cider-data-converter/src/lib.rs @@ -1,2 +1,3 @@ pub mod converter; +pub mod dat_parser; pub mod json_data; diff --git a/tools/cider-data-converter/src/main.rs b/tools/cider-data-converter/src/main.rs index e44fea9cb6..a95d910e83 100644 --- a/tools/cider-data-converter/src/main.rs +++ b/tools/cider-data-converter/src/main.rs @@ -1,11 +1,13 @@ use argh::FromArgs; -use cider_data_converter::{converter, json_data::JsonData}; +use cider_data_converter::{ + converter, dat_parser::unwrap_line_or_comment, json_data::JsonData, +}; use core::str; use interp::serialization::{self, DataDump, SerializationError}; -use itertools::Itertools; use std::{ fs::File, io::{self, BufRead, BufReader, BufWriter, Read, Write}, + iter::repeat, path::PathBuf, str::FromStr, }; @@ -13,6 +15,7 @@ use thiserror::Error; const JSON_EXTENSION: &str = "data"; const CIDER_EXTENSION: &str = "dump"; +const DAT_EXTENSION: &str = "dat"; const HEADER_FILENAME: &str = "header"; @@ -32,6 +35,14 @@ enum CiderDataConverterError { #[error(transparent)] DataDumpError(#[from] SerializationError), + + #[error( + "Missing output path. This is required for the \"to dat\" conversion" + )] + MissingDatOutputPath, + + #[error("Output path for \"to dat\" exists but it is a file")] + DatOutputPathIsFile, } impl std::fmt::Debug for CiderDataConverterError { @@ -90,6 +101,12 @@ struct Opts { /// exists solely for backwards compatibility with the old display format. #[argh(switch, long = "legacy-quotes")] use_quotes: bool, + + /// the file extension to use for the output/input file when parsing to and + /// from the dat target. If not provided, the extension is assumed to be .dat + #[argh(option, short = 'e', long = "dat-file-extension")] + #[argh(default = "String::from(DAT_EXTENSION)")] + file_extension: String, } fn main() -> Result<(), CiderDataConverterError> { @@ -97,19 +114,27 @@ fn main() -> Result<(), CiderDataConverterError> { // if no action is specified, try to guess based on file extensions if opts.action.is_none() + // input is .json && (opts.input_path.as_ref().is_some_and(|x| { x.extension().map_or(false, |y| y == JSON_EXTENSION) - }) || opts.output_path.as_ref().is_some_and(|x| { + }) + // output is .dump + || opts.output_path.as_ref().is_some_and(|x| { x.extension().map_or(false, |y| y == CIDER_EXTENSION) })) { opts.action = Some(Target::DataDump); } else if opts.action.is_none() + // output is .json && (opts.output_path.as_ref().is_some_and(|x| { x.extension().map_or(false, |x| x == JSON_EXTENSION) - }) || opts.input_path.as_ref().is_some_and(|x| { + }) + // input is .dump + || opts.input_path.as_ref().is_some_and(|x| { x.extension().map_or(false, |x| x == CIDER_EXTENSION) - })) + }) + // input is a directory (suggesting a deserialization from dat) + || opts.input_path.as_ref().is_some_and(|x| x.is_dir())) { opts.action = Some(Target::Json); } @@ -144,30 +169,31 @@ fn main() -> Result<(), CiderDataConverterError> { for mem_dec in &header.memories { let starting_len = data.len(); let mem_file = BufReader::new(File::open( - path.join(&mem_dec.name), + path.join(format!( + "{}.{}", + mem_dec.name, opts.file_extension + )), )?); - let mut line_data = vec![]; for line in mem_file.lines() { let line = line?; - for pair in &line.chars().chunks(2) { - // there has got to be a better way to do this... - let string = - pair.into_iter().collect::(); - let val = u8::from_str_radix(&string, 16) - .expect("invalid hex"); - line_data.push(val); + if let Some(line_data) = + unwrap_line_or_comment(&line) + { + assert!( + line_data.len() + <= mem_dec.bytes_per_entry() + as usize, + "line data too long" + ); + + let padding = (mem_dec.bytes_per_entry() + as usize) + - line_data.len(); + + data.extend(line_data.into_iter().rev()); + data.extend(repeat(0u8).take(padding)) } - // TODO griffin: handle inputs that are - // truncated or otherwise shorter than expected - - assert!( - line_data.len() - == (mem_dec.bytes_per_entry() as usize) - ); - // reverse the byte order to get the expected - // little endian and reuse the vec - data.extend(line_data.drain(..).rev()) } assert_eq!( @@ -213,17 +239,22 @@ fn main() -> Result<(), CiderDataConverterError> { if let Some(path) = opts.output_path { if path.exists() && !path.is_dir() { - // TODO griffin: Make this an actual error - panic!("Output path exists but is not a directory") + return Err( + CiderDataConverterError::DatOutputPathIsFile, + ); } else if !path.exists() { std::fs::create_dir(&path)?; } - let mut header_output = File::create(path.join("header"))?; + let mut header_output = + File::create(path.join(HEADER_FILENAME))?; header_output.write_all(&data.header.serialize()?)?; for memory in &data.header.memories { - let file = File::create(path.join(&memory.name))?; + let file = File::create(path.join(format!( + "{}.{}", + memory.name, opts.file_extension + )))?; let mut writer = BufWriter::new(file); for bytes in data .get_data(&memory.name) @@ -243,8 +274,7 @@ fn main() -> Result<(), CiderDataConverterError> { } } } else { - // TODO griffin: Make this an actual error - panic!("Output path not specified, this is required for the dat target") + return Err(CiderDataConverterError::MissingDatOutputPath); } } }