From fa298e93c23ea7658e1b8dbf56d2145c6b65976a Mon Sep 17 00:00:00 2001
From: Griffin Berlstein <griffin@berlste.in>
Date: Thu, 14 Nov 2024 15:11:18 +0000
Subject: [PATCH] [Data converter] Improve the hex parsing & general
 functionality (#2352)

I spent some more time hacking on this instead of spending my time in a
more productive way. This does the following:
- Switch `dat` deserialization from ad-hoc character matching to a proper
`nom` parser that handles comments and a leading `0x` prefix.
- The parser now accepts `dat` values with leading zeroes omitted (though
we continue to generate `dat` files with the leading zeroes included).
- Output `dat` files are named `MEMNAME.dat` by default, though the
extension can be customized with the `-e` flag, e.g. `-e out` will
generate `MEMNAME.out`.
- Similarly, when reading in a `dat` directory, the tool will look for
`MEMNAME.dat`, which can be retargeted via the `-e` flag.
- The tool will also infer the `--to json` target when given a directory
as input
---
 Cargo.lock                                   |  19 ++-
 tools/cider-data-converter/Cargo.toml        |   1 +
 tools/cider-data-converter/src/dat_parser.rs | 133 +++++++++++++++++++
 tools/cider-data-converter/src/lib.rs        |   1 +
 tools/cider-data-converter/src/main.rs       |  90 ++++++++-----
 5 files changed, 213 insertions(+), 31 deletions(-)
 create mode 100644 tools/cider-data-converter/src/dat_parser.rs

diff --git a/Cargo.lock b/Cargo.lock
index 7b5156844a..7eba776379 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -544,6 +544,7 @@ dependencies = [
  "argh",
  "interp",
  "itertools 0.11.0",
+ "nom 7.1.3",
  "num-bigint",
  "num-rational",
  "num-traits",
@@ -1623,6 +1624,12 @@ version = "2.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149"
 
+[[package]]
+name = "minimal-lexical"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
+
 [[package]]
 name = "miniz_oxide"
 version = "0.7.2"
@@ -1674,6 +1681,16 @@ dependencies = [
  "version_check 0.1.5",
 ]
 
+[[package]]
+name = "nom"
+version = "7.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
+dependencies = [
+ "memchr",
+ "minimal-lexical",
+]
+
 [[package]]
 name = "nu-ansi-term"
 version = "0.46.0"
@@ -2409,7 +2426,7 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5318bfeed779c64075ce317c81462ed54dc00021be1c6b34957d798e11a68bdb"
 dependencies = [
- "nom",
+ "nom 4.2.3",
  "serde",
 ]
 
diff --git a/tools/cider-data-converter/Cargo.toml b/tools/cider-data-converter/Cargo.toml
index a5755b4066..a690023d34 100644
--- a/tools/cider-data-converter/Cargo.toml
+++ b/tools/cider-data-converter/Cargo.toml
@@ -17,6 +17,7 @@ thiserror = "1.0.59"
 num-bigint = { version = "0.4.6" }
 num-rational = { version = "0.4.2" }
 num-traits = { version = "0.2.19" }
+nom = "7.1.3"
 
 [dev-dependencies]
 proptest = "1.0.0"
diff --git a/tools/cider-data-converter/src/dat_parser.rs b/tools/cider-data-converter/src/dat_parser.rs
new file mode 100644
index 0000000000..88f99d6ba6
--- /dev/null
+++ b/tools/cider-data-converter/src/dat_parser.rs
@@ -0,0 +1,133 @@
+use nom::{
+    branch::alt,
+    bytes::complete::{tag, take_while_m_n},
+    character::complete::{anychar, line_ending, multispace0},
+    combinator::{eof, map_res, opt},
+    error::Error,
+    multi::{many1, many_till},
+    sequence::{preceded, tuple},
+    IResult,
+};
+
+fn is_hex_digit(c: char) -> bool {
+    c.is_ascii_hexdigit()
+}
+
+fn from_hex(input: &str) -> Result<u8, std::num::ParseIntError> {
+    u8::from_str_radix(input, 16)
+}
+
+fn parse_hex(input: &str) -> IResult<&str, u8> {
+    map_res(take_while_m_n(1, 2, is_hex_digit), from_hex)(input)
+}
+
+/// Parse a single line of hex characters into a vector of bytes in the order
+/// the characters are written, i.e. reversed relative to little-endian layout.
+fn hex_line(input: &str) -> IResult<&str, LineOrComment> {
+    // strip any leading whitespace
+    let (input, bytes) = preceded(
+        tuple((multispace0, opt(tag("0x")))),
+        many1(parse_hex),
+    )(input)?;
+
+    Ok((input, LineOrComment::Line(bytes)))
+}
+
+fn comment(input: &str) -> IResult<&str, LineOrComment> {
+    // skip any whitespace
+    let (input, _) = multispace0(input)?;
+    let (input, _) = tag("//")(input)?;
+    let (input, _) = many_till(anychar, alt((line_ending, eof)))(input)?;
+    Ok((input, LineOrComment::Comment))
+}
+/// Parse a line which only contains whitespace
+fn empty_line(input: &str) -> IResult<&str, LineOrComment> {
+    // skip any whitespace
+    let (input, _) = multispace0(input)?;
+    Ok((input, LineOrComment::EmptyLine))
+}
+
+pub fn line_or_comment(
+    input: &str,
+) -> Result<LineOrComment, nom::Err<Error<&str>>> {
+    let (_, res) = alt((hex_line, comment, empty_line))(input)?;
+    Ok(res)
+}
+
+#[derive(Debug, PartialEq)]
+pub enum LineOrComment {
+    Line(Vec<u8>),
+    Comment,
+    EmptyLine,
+}
+
+/// Parse a single line of hex characters, or a comment. Returns None if it's a
+/// comment or an empty line and Some(Vec<u8>) if it's a hex line. Panics on a
+/// parse error.
+///
+/// For the fallible version, see `line_or_comment`.
+pub fn unwrap_line_or_comment(input: &str) -> Option<Vec<u8>> {
+    match line_or_comment(input).expect("hex parse failed") {
+        LineOrComment::Line(vec) => Some(vec),
+        LineOrComment::Comment => None,
+        LineOrComment::EmptyLine => None,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_comment() {
+        assert_eq!(comment("// comment"), Ok(("", LineOrComment::Comment)));
+        assert_eq!(comment("// comment\n"), Ok(("", LineOrComment::Comment)));
+    }
+
+    #[test]
+    fn test_hex_line() {
+        assert_eq!(hex_line("0x01"), Ok(("", LineOrComment::Line(vec![1]))));
+        assert_eq!(hex_line("0x02"), Ok(("", LineOrComment::Line(vec![2]))));
+        assert_eq!(hex_line("0x03"), Ok(("", LineOrComment::Line(vec![3]))));
+        assert_eq!(hex_line("0x04"), Ok(("", LineOrComment::Line(vec![4]))));
+        assert_eq!(hex_line("0x05"), Ok(("", LineOrComment::Line(vec![5]))));
+        assert_eq!(hex_line("0x06"), Ok(("", LineOrComment::Line(vec![6]))));
+        assert_eq!(hex_line("0x07"), Ok(("", LineOrComment::Line(vec![7]))));
+        assert_eq!(hex_line("0x08"), Ok(("", LineOrComment::Line(vec![8]))));
+        assert_eq!(hex_line("0x09"), Ok(("", LineOrComment::Line(vec![9]))));
+        assert_eq!(hex_line("0x0a"), Ok(("", LineOrComment::Line(vec![10]))));
+        assert_eq!(hex_line("0x0b"), Ok(("", LineOrComment::Line(vec![11]))));
+        assert_eq!(hex_line("0x0c"), Ok(("", LineOrComment::Line(vec![12]))));
+        assert_eq!(hex_line("0x0d"), Ok(("", LineOrComment::Line(vec![13]))));
+        assert_eq!(hex_line("0x0e"), Ok(("", LineOrComment::Line(vec![14]))));
+        assert_eq!(hex_line("0x0f"), Ok(("", LineOrComment::Line(vec![15]))));
+        assert_eq!(hex_line("0xff"), Ok(("", LineOrComment::Line(vec![255]))));
+        assert_eq!(
+            hex_line("0x00ff"),
+            Ok(("", LineOrComment::Line(vec![0, 255])))
+        );
+    }
+
+    #[test]
+    fn test_from_hex() {
+        assert_eq!(from_hex("0"), Ok(0));
+        assert_eq!(from_hex("1"), Ok(1));
+        assert_eq!(from_hex("2"), Ok(2));
+        assert_eq!(from_hex("3"), Ok(3));
+        assert_eq!(from_hex("4"), Ok(4));
+        assert_eq!(from_hex("5"), Ok(5));
+        assert_eq!(from_hex("6"), Ok(6));
+        assert_eq!(from_hex("7"), Ok(7));
+        assert_eq!(from_hex("8"), Ok(8));
+        assert_eq!(from_hex("9"), Ok(9));
+        assert_eq!(from_hex("a"), Ok(10));
+        assert_eq!(from_hex("b"), Ok(11));
+        assert_eq!(from_hex("c"), Ok(12));
+        assert_eq!(from_hex("d"), Ok(13));
+        assert_eq!(from_hex("e"), Ok(14));
+        assert_eq!(from_hex("f"), Ok(15));
+
+        assert_eq!(from_hex("FF"), Ok(255));
+        assert_eq!(from_hex("ff"), Ok(255));
+    }
+}
diff --git a/tools/cider-data-converter/src/lib.rs b/tools/cider-data-converter/src/lib.rs
index 7e1dadc1bd..31092d73e6 100644
--- a/tools/cider-data-converter/src/lib.rs
+++ b/tools/cider-data-converter/src/lib.rs
@@ -1,2 +1,3 @@
 pub mod converter;
+pub mod dat_parser;
 pub mod json_data;
diff --git a/tools/cider-data-converter/src/main.rs b/tools/cider-data-converter/src/main.rs
index e44fea9cb6..a95d910e83 100644
--- a/tools/cider-data-converter/src/main.rs
+++ b/tools/cider-data-converter/src/main.rs
@@ -1,11 +1,13 @@
 use argh::FromArgs;
-use cider_data_converter::{converter, json_data::JsonData};
+use cider_data_converter::{
+    converter, dat_parser::unwrap_line_or_comment, json_data::JsonData,
+};
 use core::str;
 use interp::serialization::{self, DataDump, SerializationError};
-use itertools::Itertools;
 use std::{
     fs::File,
     io::{self, BufRead, BufReader, BufWriter, Read, Write},
+    iter::repeat,
     path::PathBuf,
     str::FromStr,
 };
@@ -13,6 +15,7 @@ use thiserror::Error;
 
 const JSON_EXTENSION: &str = "data";
 const CIDER_EXTENSION: &str = "dump";
+const DAT_EXTENSION: &str = "dat";
 
 const HEADER_FILENAME: &str = "header";
 
@@ -32,6 +35,14 @@ enum CiderDataConverterError {
 
     #[error(transparent)]
     DataDumpError(#[from] SerializationError),
+
+    #[error(
+        "Missing output path. This is required for the \"to dat\" conversion"
+    )]
+    MissingDatOutputPath,
+
+    #[error("Output path for \"to dat\" exists but it is a file")]
+    DatOutputPathIsFile,
 }
 
 impl std::fmt::Debug for CiderDataConverterError {
@@ -90,6 +101,12 @@ struct Opts {
     /// exists solely for backwards compatibility with the old display format.
     #[argh(switch, long = "legacy-quotes")]
     use_quotes: bool,
+
+    /// the file extension to use for the output/input file when parsing to and
+    /// from the dat target. If not provided, the extension is assumed to be .dat
+    #[argh(option, short = 'e', long = "dat-file-extension")]
+    #[argh(default = "String::from(DAT_EXTENSION)")]
+    file_extension: String,
 }
 
 fn main() -> Result<(), CiderDataConverterError> {
@@ -97,19 +114,27 @@ fn main() -> Result<(), CiderDataConverterError> {
 
     // if no action is specified, try to guess based on file extensions
     if opts.action.is_none()
+        // input is .json
         && (opts.input_path.as_ref().is_some_and(|x| {
             x.extension().map_or(false, |y| y == JSON_EXTENSION)
-        }) || opts.output_path.as_ref().is_some_and(|x| {
+        })
+        // output is .dump
+        || opts.output_path.as_ref().is_some_and(|x| {
             x.extension().map_or(false, |y| y == CIDER_EXTENSION)
         }))
     {
         opts.action = Some(Target::DataDump);
     } else if opts.action.is_none()
+        // output is .json
         && (opts.output_path.as_ref().is_some_and(|x| {
             x.extension().map_or(false, |x| x == JSON_EXTENSION)
-        }) || opts.input_path.as_ref().is_some_and(|x| {
+        })
+        // input is .dump
+        || opts.input_path.as_ref().is_some_and(|x| {
             x.extension().map_or(false, |x| x == CIDER_EXTENSION)
-        }))
+        })
+        // input is a directory (suggesting a deserialization from dat)
+        || opts.input_path.as_ref().is_some_and(|x| x.is_dir()))
     {
         opts.action = Some(Target::Json);
     }
@@ -144,30 +169,31 @@ fn main() -> Result<(), CiderDataConverterError> {
                         for mem_dec in &header.memories {
                             let starting_len = data.len();
                             let mem_file = BufReader::new(File::open(
-                                path.join(&mem_dec.name),
+                                path.join(format!(
+                                    "{}.{}",
+                                    mem_dec.name, opts.file_extension
+                                )),
                             )?);
 
-                            let mut line_data = vec![];
                             for line in mem_file.lines() {
                                 let line = line?;
-                                for pair in &line.chars().chunks(2) {
-                                    // there has got to be a better way to do this...
-                                    let string =
-                                        pair.into_iter().collect::<String>();
-                                    let val = u8::from_str_radix(&string, 16)
-                                        .expect("invalid hex");
-                                    line_data.push(val);
+                                if let Some(line_data) =
+                                    unwrap_line_or_comment(&line)
+                                {
+                                    assert!(
+                                        line_data.len()
+                                            <= mem_dec.bytes_per_entry()
+                                                as usize,
+                                        "line data too long"
+                                    );
+
+                                    let padding = (mem_dec.bytes_per_entry()
+                                        as usize)
+                                        - line_data.len();
+
+                                    data.extend(line_data.into_iter().rev());
+                                    data.extend(repeat(0u8).take(padding))
                                 }
-                                // TODO griffin: handle inputs that are
-                                // truncated or otherwise shorter than expected
-
-                                assert!(
-                                    line_data.len()
-                                        == (mem_dec.bytes_per_entry() as usize)
-                                );
-                                // reverse the byte order to get the expected
-                                // little endian and reuse the vec
-                                data.extend(line_data.drain(..).rev())
                             }
 
                             assert_eq!(
@@ -213,17 +239,22 @@ fn main() -> Result<(), CiderDataConverterError> {
 
                 if let Some(path) = opts.output_path {
                     if path.exists() && !path.is_dir() {
-                        // TODO griffin: Make this an actual error
-                        panic!("Output path exists but is not a directory")
+                        return Err(
+                            CiderDataConverterError::DatOutputPathIsFile,
+                        );
                     } else if !path.exists() {
                         std::fs::create_dir(&path)?;
                     }
 
-                    let mut header_output = File::create(path.join("header"))?;
+                    let mut header_output =
+                        File::create(path.join(HEADER_FILENAME))?;
                     header_output.write_all(&data.header.serialize()?)?;
 
                     for memory in &data.header.memories {
-                        let file = File::create(path.join(&memory.name))?;
+                        let file = File::create(path.join(format!(
+                            "{}.{}",
+                            memory.name, opts.file_extension
+                        )))?;
                         let mut writer = BufWriter::new(file);
                         for bytes in data
                             .get_data(&memory.name)
@@ -243,8 +274,7 @@ fn main() -> Result<(), CiderDataConverterError> {
                         }
                     }
                 } else {
-                    // TODO griffin: Make this an actual error
-                    panic!("Output path not specified, this is required for the dat target")
+                    return Err(CiderDataConverterError::MissingDatOutputPath);
                 }
             }
         }