From 550113592316fd35393bc3ad91237861d1a1535a Mon Sep 17 00:00:00 2001
From: Michal Moskal <michal@moskal.me>
Date: Thu, 14 Mar 2024 18:33:13 +0000
Subject: [PATCH] better grm formatting

---
 controllers/aici_abi/src/earley/byteset.rs |  27 +++++
 controllers/aici_abi/src/earley/grammar.rs | 118 ++++++++++++++-------
 2 files changed, 105 insertions(+), 40 deletions(-)
diff --git a/controllers/aici_abi/src/earley/byteset.rs b/controllers/aici_abi/src/earley/byteset.rs
index dc578e1a..94e7e49c 100644
--- a/controllers/aici_abi/src/earley/byteset.rs
+++ b/controllers/aici_abi/src/earley/byteset.rs
@@ -104,4 +104,31 @@ impl ByteSet {
         }
         r
     }
+
+    /// Counts the bytes in the set by summing the set bits of the first
+    /// `BYTESET_LEN` mask words.
+    pub fn num_bytes(&self) -> usize {
+        (0..BYTESET_LEN)
+            .map(|word| self.mask[word].count_ones() as usize)
+            .sum()
+    }
+
+    /// Returns the smallest byte whose bit is set in the mask, or `None`
+    /// when all `BYTESET_LEN` mask words are zero.
+    pub fn first_byte(&self) -> Option<u8> {
+        // Find the first non-zero word; `?` returns `None` if there is none.
+        let word = (0..BYTESET_LEN).find(|&w| self.mask[w] != 0)?;
+        let low_bit = self.mask[word].trailing_zeros() as usize;
+        // The `* 32` stride presumes 32-bit mask words -- TODO confirm
+        // against the declaration of the `mask` field.
+        Some((word * 32 + low_bit) as u8)
+    }
+
+    /// Returns the set's only byte, or `None` unless it holds exactly one.
+    pub fn single_byte(&self) -> Option<u8> {
+        match self.num_bytes() {
+            1 => self.first_byte(),
+            _ => None,
+        }
+    }
 }
diff --git a/controllers/aici_abi/src/earley/grammar.rs b/controllers/aici_abi/src/earley/grammar.rs
index 304e4007..8a3543ea 100644
--- a/controllers/aici_abi/src/earley/grammar.rs
+++ b/controllers/aici_abi/src/earley/grammar.rs
@@ -32,6 +32,22 @@ impl Rule {
     }
 }
 
+/// Display form of one right-hand-side symbol in a printed rule.
+enum SymName {
+    Name(String),
+    Byte(u8),
+}
+
+impl SymName {
+    /// Picks `Byte` when `bytes` holds exactly one byte, else `Name(name)`.
+    fn from(name: &str, bytes: Option<&ByteSet>) -> Self {
+        match bytes.and_then(ByteSet::single_byte) {
+            Some(byte) => SymName::Byte(byte),
+            None => SymName::Name(name.to_string()),
+        }
+    }
+}
+
 pub struct Grammar {
     symbols: Vec<Symbol>,
     symbol_by_name: FxHashMap<String, SymIdx>,
@@ -87,28 +103,18 @@ impl Grammar {
         &self.symbols[sym.0 as usize].name
     }
 
-    fn rule_to_string(&self, rule: &Rule, dot: usize) -> String {
-        let lhs = self.sym_name(rule.lhs());
-        let mut rhs = rule
-            .rhs
-            .iter()
-            .enumerate()
-            .map(|(i, s)| {
-                format!(
-                    "{}{}",
-                    if i == dot { "(*) " } else { "" },
-                    self.sym_name(*s)
-                )
-            })
-            .collect::<Vec<_>>()
-            .join(" ");
-        if rule.rhs.is_empty() {
-            rhs.push_str("ϵ");
-        }
-        if dot == rule.rhs.len() {
-            rhs.push_str(" (*)");
-        }
-        format!("{} ::= {}", lhs, rhs)
+    /// Formats `rule` through the free `rule_to_string` helper.
+    ///
+    /// Each right-hand-side symbol is converted with `SymName::from`, using
+    /// the symbol's optional byte set; `dot` is forwarded unchanged.
+    fn rule_to_string(&self, rule: &Rule, dot: Option<usize>) -> String {
+        let lhs = self.sym_name(rule.lhs());
+        let mut rhs_names = Vec::with_capacity(rule.rhs.len());
+        for sym in rule.rhs.iter() {
+            let data = self.sym_data(*sym);
+            rhs_names.push(SymName::from(&data.name, data.bytes.as_ref()));
+        }
+        rule_to_string(lhs, rhs_names, dot)
     }
 
     fn copy_from(&mut self, other: &Grammar, sym: SymIdx) -> SymIdx {
@@ -293,7 +299,7 @@ impl Debug for Grammar {
                 num_rules += sym.rules.len();
             }
             for rule in &sym.rules {
-                writeln!(f, "{}", self.rule_to_string(rule, usize::MAX))?;
+                writeln!(f, "{}", self.rule_to_string(rule, None))?;
             }
         }
         writeln!(
@@ -532,24 +538,56 @@ impl CGrammar {
     pub fn rule_to_string(&self, rule: RuleIdx) -> String {
         let lhs = self.sym_name(self.sym_idx_of(rule));
         let (rhs, dot) = self.rule_rhs(rule);
-        let mut rhs_str = rhs
-            .iter()
-            .enumerate()
-            .map(|(i, s)| {
-                format!(
-                    "{}{}",
-                    if i == dot { "(*) " } else { "" },
-                    self.sym_name(*s)
-                )
-            })
-            .collect::<Vec<_>>()
-            .join(" ");
-        if rhs.is_empty() {
-            rhs_str.push_str("ϵ");
+        // Convert every right-hand-side symbol to its display form first,
+        // then hand the pieces to the free `rule_to_string` helper, which
+        // assembles the final text.
+        let mut rhs_names = Vec::with_capacity(rhs.len());
+        for sym in rhs.iter() {
+            let data = self.sym_data(*sym);
+            // Only terminals have a byte set (stored in `self.terminals`);
+            // with one, `SymName::from` can emit a `Byte` instead of a name.
+            let bytes = if data.is_terminal {
+                Some(&self.terminals[data.idx.0 as usize])
+            } else {
+                None
+            };
+            rhs_names.push(SymName::from(&data.name, bytes));
+        }
+        // `rule_rhs` always supplies a dot position, so it is passed as `Some`.
+        rule_to_string(lhs, rhs_names, Some(dot))
+    }
+}
+
+/// Formats `lhs ::= rhs`: `ϵ` for an empty `rhs`, `•` at `dot`, quoted text for `Byte` runs.
+fn rule_to_string(lhs: &str, mut rhs: Vec<SymName>, dot: Option<usize>) -> String {
+    if rhs.is_empty() {
+        rhs.push(SymName::Name("ϵ".to_string()));
+        if dot == Some(0) {
+            rhs.push(SymName::Name("•".to_string()));
         }
-        if dot == rhs.len() {
-            rhs_str.push_str(" (*)");
+    } else if let Some(pos) = dot {
+        rhs.insert(pos, SymName::Name("•".to_string()));
+    }
+    let mut pieces = Vec::new();
+    let mut rest = rhs.as_slice();
+    while let Some(head) = rest.first() {
+        match head {
+            SymName::Name(name) => {
+                pieces.push(name.clone());
+                rest = &rest[1..];
+            }
+            SymName::Byte(_) => {
+                let run: Vec<u8> = rest
+                    .iter()
+                    .map_while(|sym| match sym {
+                        SymName::Byte(b) => Some(*b),
+                        SymName::Name(_) => None,
+                    })
+                    .collect();
+                pieces.push(format!("{:?}", String::from_utf8_lossy(&run)));
+                rest = &rest[run.len()..];
+            }
         }
-        format!("{} ::= {}", lhs, rhs_str)
     }
+    format!("{} ::= {}", lhs, pieces.join(" "))
 }