From 550113592316fd35393bc3ad91237861d1a1535a Mon Sep 17 00:00:00 2001
From: Michal Moskal
Date: Thu, 14 Mar 2024 18:33:13 +0000
Subject: [PATCH] better grm formatting

---
 controllers/aici_abi/src/earley/byteset.rs |  27 +++++
 controllers/aici_abi/src/earley/grammar.rs | 118 ++++++++++++++-------
 2 files changed, 105 insertions(+), 40 deletions(-)

diff --git a/controllers/aici_abi/src/earley/byteset.rs b/controllers/aici_abi/src/earley/byteset.rs
index dc578e1a..94e7e49c 100644
--- a/controllers/aici_abi/src/earley/byteset.rs
+++ b/controllers/aici_abi/src/earley/byteset.rs
@@ -104,4 +104,31 @@ impl ByteSet {
         }
         r
     }
+
+    pub fn num_bytes(&self) -> usize {
+        let mut r = 0;
+        for i in 0..BYTESET_LEN {
+            r += self.mask[i].count_ones() as usize;
+        }
+        r
+    }
+
+    pub fn first_byte(&self) -> Option<u8> {
+        for i in 0..BYTESET_LEN {
+            let m = self.mask[i];
+            if m != 0 {
+                let bit = m.trailing_zeros() as usize;
+                return Some((i * 32 + bit) as u8);
+            }
+        }
+        None
+    }
+
+    pub fn single_byte(&self) -> Option<u8> {
+        if self.num_bytes() != 1 {
+            None
+        } else {
+            self.first_byte()
+        }
+    }
 }
diff --git a/controllers/aici_abi/src/earley/grammar.rs b/controllers/aici_abi/src/earley/grammar.rs
index 304e4007..8a3543ea 100644
--- a/controllers/aici_abi/src/earley/grammar.rs
+++ b/controllers/aici_abi/src/earley/grammar.rs
@@ -32,6 +32,22 @@ impl Rule {
     }
 }
 
+enum SymName {
+    Name(String),
+    Byte(u8),
+}
+
+impl SymName {
+    fn from(name: &str, bytes: Option<&ByteSet>) -> Self {
+        if let Some(bytes) = bytes {
+            if let Some(b) = bytes.single_byte() {
+                return SymName::Byte(b);
+            }
+        }
+        SymName::Name(name.to_string())
+    }
+}
+
 pub struct Grammar {
     symbols: Vec<Symbol>,
     symbol_by_name: FxHashMap<String, SymIdx>,
@@ -87,28 +103,18 @@ impl Grammar {
         &self.symbols[sym.0 as usize].name
     }
 
-    fn rule_to_string(&self, rule: &Rule, dot: usize) -> String {
-        let lhs = self.sym_name(rule.lhs());
-        let mut rhs = rule
-            .rhs
-            .iter()
-            .enumerate()
-            .map(|(i, s)| {
-                format!(
-                    "{}{}",
-                    if i == dot { "(*) " } else { "" },
-                    self.sym_name(*s)
-                )
-            })
-            .collect::<Vec<_>>()
-            .join(" ");
-        if rule.rhs.is_empty() {
-            rhs.push_str("ϵ");
-        }
-        if dot == rule.rhs.len() {
-            rhs.push_str(" (*)");
-        }
-        format!("{} ::= {}", lhs, rhs)
+    fn rule_to_string(&self, rule: &Rule, dot: Option<usize>) -> String {
+        rule_to_string(
+            self.sym_name(rule.lhs()),
+            rule.rhs
+                .iter()
+                .map(|s| {
+                    let d = self.sym_data(*s);
+                    SymName::from(&d.name, d.bytes.as_ref())
+                })
+                .collect(),
+            dot,
+        )
     }
 
     fn copy_from(&mut self, other: &Grammar, sym: SymIdx) -> SymIdx {
@@ -293,7 +299,7 @@ impl Debug for Grammar {
                 num_rules += sym.rules.len();
             }
             for rule in &sym.rules {
-                writeln!(f, "{}", self.rule_to_string(rule, usize::MAX))?;
+                writeln!(f, "{}", self.rule_to_string(rule, None))?;
             }
         }
         writeln!(
@@ -532,24 +538,56 @@ impl CGrammar {
     pub fn rule_to_string(&self, rule: RuleIdx) -> String {
         let lhs = self.sym_name(self.sym_idx_of(rule));
         let (rhs, dot) = self.rule_rhs(rule);
-        let mut rhs_str = rhs
-            .iter()
-            .enumerate()
-            .map(|(i, s)| {
-                format!(
-                    "{}{}",
-                    if i == dot { "(*) " } else { "" },
-                    self.sym_name(*s)
-                )
-            })
-            .collect::<Vec<_>>()
-            .join(" ");
-        if rhs.is_empty() {
-            rhs_str.push_str("ϵ");
+        rule_to_string(
+            lhs,
+            rhs.iter()
+                .map(|s| {
+                    let d = self.sym_data(*s);
+                    SymName::from(
+                        &d.name,
+                        if d.is_terminal {
+                            Some(&self.terminals[d.idx.0 as usize])
+                        } else {
+                            None
+                        },
+                    )
+                })
+                .collect(),
+            Some(dot),
+        )
+    }
+}
+
+fn rule_to_string(lhs: &str, mut rhs: Vec<SymName>, dot: Option<usize>) -> String {
+    if rhs.is_empty() {
+        rhs.push(SymName::Name("ϵ".to_string()));
+        if dot == Some(0) {
+            rhs.push(SymName::Name("•".to_string()));
         }
-        if dot == rhs.len() {
-            rhs_str.push_str(" (*)");
+    } else if let Some(dot) = dot {
+        rhs.insert(dot, SymName::Name("•".to_string()));
+    }
+    let mut outp = Vec::new();
+    let mut i = 0;
+    while i < rhs.len() {
+        match &rhs[i] {
+            SymName::Name(s) => {
+                outp.push(s.clone());
+                i += 1;
+            }
+            SymName::Byte(_) => {
+                let mut text = Vec::new();
+                while i < rhs.len() {
+                    if let SymName::Byte(b) = rhs[i] {
+                        text.push(b);
+                        i += 1;
+                    } else {
+                        break;
+                    }
+                }
+                outp.push(format!("{:?}", String::from_utf8_lossy(&text)));
+            }
         }
-        format!("{} ::= {}", lhs, rhs_str)
     }
+    format!("{} ::= {}", lhs, outp.join(" "))
 }