Skip to content

Commit

Permalink
better grm formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
mmoskal committed Mar 14, 2024
1 parent 880ff93 commit 5501135
Show file tree
Hide file tree
Showing 2 changed files with 105 additions and 40 deletions.
27 changes: 27 additions & 0 deletions controllers/aici_abi/src/earley/byteset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,4 +104,31 @@ impl ByteSet {
}
r
}

/// Returns how many bytes are members of this set.
///
/// Adds up the population count of each of the first `BYTESET_LEN` words of `mask`.
pub fn num_bytes(&self) -> usize {
    (0..BYTESET_LEN)
        .map(|word| self.mask[word].count_ones() as usize)
        .sum()
}

/// Returns the byte encoded by the lowest set bit of the first non-zero mask word.
///
/// Words are scanned in index order; the byte value is computed as
/// `word_index * 32 + bit_index`. Returns `None` when every scanned word is zero.
pub fn first_byte(&self) -> Option<u8> {
    (0..BYTESET_LEN).find_map(|word| {
        let bits = self.mask[word];
        if bits == 0 {
            return None;
        }
        // Position of the least-significant set bit within this word.
        let offset = bits.trailing_zeros() as usize;
        Some((word * 32 + offset) as u8)
    })
}

/// Returns the sole member of the set when `num_bytes()` is exactly one, and `None` otherwise.
pub fn single_byte(&self) -> Option<u8> {
    match self.num_bytes() {
        1 => self.first_byte(),
        _ => None,
    }
}
}
118 changes: 78 additions & 40 deletions controllers/aici_abi/src/earley/grammar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,22 @@ impl Rule {
}
}

/// Display form of a right-hand-side symbol, used when rendering a rule as text.
enum SymName {
/// A symbol shown by its textual name.
Name(String),
/// A symbol shown as a single byte; `SymName::from` picks this variant when the
/// symbol's byte set reports exactly one byte.
Byte(u8),
}

impl SymName {
    /// Picks the display form for a symbol.
    ///
    /// Yields [`SymName::Byte`] when `bytes` is present and its `single_byte()` returns a
    /// byte; in every other case yields [`SymName::Name`] holding an owned copy of `name`.
    fn from(name: &str, bytes: Option<&ByteSet>) -> Self {
        match bytes.and_then(|set| set.single_byte()) {
            Some(byte) => SymName::Byte(byte),
            None => SymName::Name(name.to_string()),
        }
    }
}

pub struct Grammar {
symbols: Vec<Symbol>,
symbol_by_name: FxHashMap<String, SymIdx>,
Expand Down Expand Up @@ -87,28 +103,18 @@ impl Grammar {
&self.symbols[sym.0 as usize].name
}

fn rule_to_string(&self, rule: &Rule, dot: usize) -> String {
let lhs = self.sym_name(rule.lhs());
let mut rhs = rule
.rhs
.iter()
.enumerate()
.map(|(i, s)| {
format!(
"{}{}",
if i == dot { "(*) " } else { "" },
self.sym_name(*s)
)
})
.collect::<Vec<_>>()
.join(" ");
if rule.rhs.is_empty() {
rhs.push_str("ϵ");
}
if dot == rule.rhs.len() {
rhs.push_str(" (*)");
}
format!("{} ::= {}", lhs, rhs)
fn rule_to_string(&self, rule: &Rule, dot: Option<usize>) -> String {
rule_to_string(
self.sym_name(rule.lhs()),
rule.rhs
.iter()
.map(|s| {
let d = self.sym_data(*s);
SymName::from(&d.name, d.bytes.as_ref())
})
.collect(),
dot,
)
}

fn copy_from(&mut self, other: &Grammar, sym: SymIdx) -> SymIdx {
Expand Down Expand Up @@ -293,7 +299,7 @@ impl Debug for Grammar {
num_rules += sym.rules.len();
}
for rule in &sym.rules {
writeln!(f, "{}", self.rule_to_string(rule, usize::MAX))?;
writeln!(f, "{}", self.rule_to_string(rule, None))?;
}
}
writeln!(
Expand Down Expand Up @@ -532,24 +538,56 @@ impl CGrammar {
pub fn rule_to_string(&self, rule: RuleIdx) -> String {
let lhs = self.sym_name(self.sym_idx_of(rule));
let (rhs, dot) = self.rule_rhs(rule);
let mut rhs_str = rhs
.iter()
.enumerate()
.map(|(i, s)| {
format!(
"{}{}",
if i == dot { "(*) " } else { "" },
self.sym_name(*s)
)
})
.collect::<Vec<_>>()
.join(" ");
if rhs.is_empty() {
rhs_str.push_str("ϵ");
rule_to_string(
lhs,
rhs.iter()
.map(|s| {
let d = self.sym_data(*s);
SymName::from(
&d.name,
if d.is_terminal {
Some(&self.terminals[d.idx.0 as usize])
} else {
None
},
)
})
.collect(),
Some(dot),
)
}
}

fn rule_to_string(lhs: &str, mut rhs: Vec<SymName>, dot: Option<usize>) -> String {
if rhs.is_empty() {
rhs.push(SymName::Name("ϵ".to_string()));
if dot == Some(0) {
rhs.push(SymName::Name("•".to_string()));
}
if dot == rhs.len() {
rhs_str.push_str(" (*)");
} else if let Some(dot) = dot {
rhs.insert(dot, SymName::Name("•".to_string()));
}
let mut outp = Vec::new();
let mut i = 0;
while i < rhs.len() {
match &rhs[i] {
SymName::Name(s) => {
outp.push(s.clone());
i += 1;
}
SymName::Byte(_) => {
let mut text = Vec::new();
while i < rhs.len() {
if let SymName::Byte(b) = rhs[i] {
text.push(b);
i += 1;
} else {
break;
}
}
outp.push(format!("{:?}", String::from_utf8_lossy(&text)));
}
}
format!("{} ::= {}", lhs, rhs_str)
}
format!("{} ::= {}", lhs, outp.join(" "))
}

0 comments on commit 5501135

Please sign in to comment.