Skip to content

Commit

Permalink
Fixed #213 -- both symbol mode and word mode now allow G1 indicators …
Browse files Browse the repository at this point in the history
…to happen later in the string.

This means that some alternative braille output was generated. In one case (stat_9_7_2), the result is not a listed alternative, but it is reasonable and consistent with other listed results.

Added more contractions
Improved the code to make it easier to include contractions. The main downside is that some code runs on start up that was previously a `const`, but it shouldn't be much time and only runs if UEB is used.

Need to add an exceptions list (#223) and then add back the `cc` contractions (commented out due to `arccos`).
  • Loading branch information
NSoiffer committed Nov 16, 2023
1 parent cace890 commit 1c258f4
Show file tree
Hide file tree
Showing 2 changed files with 102 additions and 72 deletions.
163 changes: 95 additions & 68 deletions src/braille.rs
Original file line number Diff line number Diff line change
Expand Up @@ -579,28 +579,15 @@ fn ueb_cleanup(pref_manager: Ref<PreferenceManager>, raw_braille: String) -> Str
// The G1 Word mode might not be at the start (iceb.rs:omission_3_6_7)
let grade1_word = try_grade1_word_mode(raw_braille);
debug!("Word mode: '{}'", grade1_word);


// It is pointless to go into passage mode if the internal string is the same as word mode
let mut grade1_passage = "".to_string();
let mut n_blanks = 0;
if grade1_word.chars().any(|ch| {
if ch == 'W' {
n_blanks += 1;
}
n_blanks == 2
}) {
grade1_passage = remove_unneeded_mode_changes(raw_braille, UEB_Mode::Grade1, UEB_Duration::Passage);
// debug!("Passage mode: '{}'", &grade1_passage);
}
if grade1_passage.is_empty() || grade1_passage == grade1_word {
if !grade1_word.is_empty() {
return grade1_word;
} else {
let grade1_passage = remove_unneeded_mode_changes(raw_braille, UEB_Mode::Grade1, UEB_Duration::Passage);
return "⠰⠰⠰".to_string() + &grade1_passage + "⠰⠄";
}
}

/// Return true if the BANA guidelines say it is ok to start with grade 2
/// Return true if the BANA or ICEB guidelines say it is ok to start with grade 2
fn is_grade2_string_ok(grade2_braille: &str) -> bool {
// BANA says use grade 2 if there is not more than one grade one symbol or single letter standing alone.
// The exact quote from their guidance:
Expand All @@ -609,12 +596,14 @@ fn ueb_cleanup(pref_manager: Ref<PreferenceManager>, raw_braille: String) -> Str
// begin the expression with a grade 1 word indicator
// Note: I modified this slightly to exclude the cap indicator in the count. That allows three more ICEB rules to pass and seems
// like it is a reasonable thing to do.
// Another modification is to allow a single G1 indicator to occur after whitespace later on
// because ICEB examples show it and it seems better than going to passage mode if it is the only G1 indicator

// Because of the 'L's which go away, we have to put a little more work into finding the first three chars
let chars = grade2_braille.chars().collect::<Vec<char>>();
let mut n_real_chars = 0; // actually number of chars
let mut found_g1 = false;
let mut i = 0; // chars starts on the 4th char
let mut i = 0;
while i < chars.len() {
let ch = chars[i];
if ch == '1' && !is_forced_grade1(&chars, i) {
Expand All @@ -625,24 +614,29 @@ fn ueb_cleanup(pref_manager: Ref<PreferenceManager>, raw_braille: String) -> Str
} else if !"𝐶CLobc".contains(ch) {
if n_real_chars == 2 {
i += 1;
break; // this is the third real char
break; // this is the third real char
};
n_real_chars += 1;
}
i += 1
}

// if we find another g1 that isn't forced and isn't standing alone, we are done
// to the standing alone rule, I've added a 'follows whitespace' clause for test iceb.rs:omission_3_6_2
// if we find *another* g1 that isn't forced and isn't standing alone, we are done
// I've added a 'follows whitespace' clause for test iceb.rs:omission_3_6_2 to the standing alone rule
// we only allow one standing alone example -- not sure if BANA guidance has this limit, but GTM 11_5_5_3 seems better with it
// Same for GTM 1_7_3_1 (passage mode is mentioned also)
let mut is_standing_alone_already_encountered = false;
let mut is_after_whitespace = false;
while i < chars.len() {
let ch = chars[i];
if ch == '1' && !is_forced_grade1(&chars, i) {
if ch == 'W' {
is_after_whitespace = true;
} else if ch == '1' && !is_forced_grade1(&chars, i) {
if is_standing_alone_already_encountered ||
!((!found_g1 && chars[i-1] == 'W') || is_single_letter_on_right(&chars, i)) {
((found_g1 || !is_after_whitespace) && !is_single_letter_on_right(&chars, i)) {
return false;
}
found_g1 = true;
is_standing_alone_already_encountered = true;
}
i += 1;
Expand Down Expand Up @@ -699,28 +693,26 @@ fn ueb_cleanup(pref_manager: Ref<PreferenceManager>, raw_braille: String) -> Str
fn try_grade1_word_mode(raw_braille: &str) -> String {
// this isn't quite right, but pretty close -- try splitting at 'W' (words)
// only one of the parts can be in word mode and none of the others can have '1' unless forced
let mut g1_word_braille = "".to_string();
let mut g1_words = Vec::default();
let mut found_word_mode = false;
for raw_word in raw_braille.split('W') {
let word = remove_unneeded_mode_changes(raw_word, UEB_Mode::Grade2, UEB_Duration::Symbol);
debug!("try_grade1_word_mode: word='{}'", word);
if !found_word_mode {
let raw_word_chars = raw_braille.chars().collect::<Vec<char>>();

let needs_word_mode = raw_word_chars.iter().enumerate()
.any(|(i, &ch) | ch == '1' && !is_forced_grade1(&raw_word_chars, i));
if needs_word_mode {
found_word_mode = true;
g1_word_braille.push_str("⠰⠰");
g1_word_braille.push_str(
&remove_unneeded_mode_changes(raw_word, UEB_Mode::Grade1, UEB_Duration::Word)
);
let word_chars = word.chars().collect::<Vec<char>>();
let needs_word_mode = word_chars.iter().enumerate()
.any(|(i, &ch) | ch == '1' && !is_forced_grade1(&word_chars, i));
if needs_word_mode {
if found_word_mode {
return "".to_string();
}
found_word_mode = true;
g1_words.push("⠰⠰".to_string() + &remove_unneeded_mode_changes(raw_word, UEB_Mode::Grade1, UEB_Duration::Word)
);
} else {
g1_words.push(word);
}
g1_word_braille.push_str(&word);
g1_word_braille.push('W');
}
return g1_word_braille;
return if found_word_mode {g1_words.join("W")} else {"".to_string()};
}
}
}
Expand Down Expand Up @@ -796,7 +788,7 @@ fn capitals_to_word_mode(braille: &str) -> String {
if find_next_char(&chars[next_non_cap..], 'C').is_some() { // next letter sequence "C..."
if is_next_char_start_of_section_12_modifier(&chars[next_non_cap+1..]) {
// to me this is tricky -- section 12 modifiers apply to the previous item
// the last clause of the "item" def is the previous "individual symbol" which ICEB 2.1 say is:
// the last clause of the "item" def is the previous "individual symbol" which ICEB 2.1 says is:
// braille sign: one or more consecutive braille characters comprising a unit,
// consisting of a root on its own or a root preceded by one or more
// prefixes (also referred to as braille symbol)
Expand Down Expand Up @@ -1248,43 +1240,78 @@ fn stands_alone(chars: &[char], i: usize) -> (bool, &[char], usize) {
}
}


/// Return a modified result if chars can be contracted.
/// Otherwise, the original string is returned
fn handle_contractions(chars: &[char], mut result: String) -> String {
struct Replacement {
pattern: &'static str,
pattern: String,
replacement: &'static str
}

// It would be much better from an extensibility point of view to read the table in from a file
// FIX: this would be much easier to read/maintain if ASCII braille were used
// FIX: (without the "L"s) and the CONTRACTIONS table built as a lazy static
static CONTRACTIONS: &[Replacement] = &[
Replacement{ pattern: "L⠁L⠝L⠙", replacement: "L⠯" }, // and
Replacement{ pattern: "L⠋L⠕L⠗", replacement: "L⠿" }, // for
Replacement{ pattern: "L⠕L⠋", replacement: "L⠷" }, // of
Replacement{ pattern: "L⠞L⠓L⠑", replacement: "L⠮" }, // the
Replacement{ pattern: "L⠺L⠊L⠞L⠓", replacement: "L⠾" }, // with
Replacement{ pattern: "L⠉L⠓", replacement: "L⠡" }, // ch
Replacement{ pattern: "L⠊L⠝", replacement: "L⠔" }, // in

// cc -- don't match if after/before a cap letter -- no/can't use negative pattern (?!...) in regex package
// figure this out -- also applies to ea, bb, ff, and gg (not that they matter)
// cc may be important for "arccos", but RUEB doesn't apply it to "arccosine", so maybe not
// Replacement{ pattern: "L⠉L⠉", replacement: "L⠒" }, // cc -- don't match if after/before a cap letter


Replacement{ pattern: "L⠎L⠓", replacement: "L⠩" }, // sh
Replacement{ pattern: "L⠁L⠗", replacement: "L⠜" }, // ar
Replacement{ pattern: "L⠑L⠗", replacement: "L⠻" }, // er
Replacement{ pattern: "(?P<s>L.)L⠍L⠑L⠝L⠞", replacement: "${s}L⠰L⠞" }, // ment
Replacement{ pattern: "(?P<s>L.)L⠞L⠊L⠕L⠝", replacement: "${s}L⠰L⠝" } ,// tion
Replacement{ pattern: "(?P<s>L.)L⠑L⠁(?P<e>L.)", replacement: "${s}L⠂${e}" }, // ea
/// Lookup table from braille ASCII to Unicode braille cells.
///
/// The table has 64 entries and is indexed by `byte - 32`, so it covers the
/// ASCII range 0x20 (space) through 0x5F ('_'); letters occupy the uppercase slots.
const ASCII_TO_UNICODE: &[char] = &[
    '⠀', '⠮', '⠐', '⠼', '⠫', '⠩', '⠯', '⠄', '⠷', '⠾', '⠡', '⠬', '⠠', '⠤', '⠨', '⠌',
    '⠴', '⠂', '⠆', '⠒', '⠲', '⠢', '⠖', '⠶', '⠦', '⠔', '⠱', '⠰', '⠣', '⠿', '⠜', '⠹',
    '⠈', '⠁', '⠃', '⠉', '⠙', '⠑', '⠋', '⠛', '⠓', '⠊', '⠚', '⠅', '⠇', '⠍', '⠝', '⠕',
    '⠏', '⠟', '⠗', '⠎', '⠞', '⠥', '⠧', '⠺', '⠭', '⠽', '⠵', '⠪', '⠳', '⠻', '⠘', '⠸',
];

/// Convert a braille-ASCII string into Unicode braille, prefixing every cell with 'L'.
///
/// Each input byte is upper-cased before the table lookup, so "and" and "AND"
/// produce the same result ("L⠁L⠝L⠙").
///
/// # Panics
/// Panics if a byte (after upper-casing) falls outside 0x20..=0x5F, because
/// there is no table entry for it.
fn to_unicode_braille(ascii: &str) -> String {
    // Every input byte yields 'L' (1 byte) plus one braille char (3 bytes in UTF-8).
    let initial = String::with_capacity(4 * ascii.len());
    ascii.bytes().fold(initial, |mut braille, byte| {
        let cell = ASCII_TO_UNICODE[usize::from(byte.to_ascii_uppercase() - 32)];
        braille.push('L');
        braille.push(cell);
        braille
    })
}

// It would be much better from an extensibility point of view to read the table in from a file
lazy_static! {
static ref CONTRACTION_PATTERNS: RegexSet = init_patterns(CONTRACTIONS);
static ref CONTRACTION_REGEX: Vec<Regex> = init_regex(CONTRACTIONS);
static ref CONTRACTIONS: Vec<Replacement> = vec![
// 10.3: Strong contractions
Replacement{ pattern: to_unicode_braille("and"), replacement: "L⠯"},
Replacement{ pattern: to_unicode_braille("for"), replacement: "L⠿"},
Replacement{ pattern: to_unicode_braille("of"), replacement: "L⠷"},
Replacement{ pattern: to_unicode_braille("the"), replacement: "L⠮"},
Replacement{ pattern: to_unicode_braille("with"), replacement: "L⠾"},

// 10.8: final-letter groupsigns (these need to precede 'en' and any other shorter contraction)
Replacement{ pattern: "(?P<s>L.)L⠍L⠑L⠝L⠞".to_string(), replacement: "${s}L⠰L⠞" }, // ment
Replacement{ pattern: "(?P<s>L.)L⠞L⠊L⠕L⠝".to_string(), replacement: "${s}L⠰L⠝" } ,// tion

// 10.4: Strong groupsigns
Replacement{ pattern: to_unicode_braille("ch"), replacement: "L⠡"},
Replacement{ pattern: to_unicode_braille("gh"), replacement: "L⠣"},
Replacement{ pattern: to_unicode_braille("sh"), replacement: "L⠩"},
Replacement{ pattern: to_unicode_braille("th"), replacement: "L⠹"},
Replacement{ pattern: to_unicode_braille("wh"), replacement: "L⠱"},
Replacement{ pattern: to_unicode_braille("ed"), replacement: "L⠫"},
Replacement{ pattern: to_unicode_braille("er"), replacement: "L⠻"},
Replacement{ pattern: to_unicode_braille("ou"), replacement: "L⠳"},
Replacement{ pattern: to_unicode_braille("ow"), replacement: "L⠪"},
Replacement{ pattern: to_unicode_braille("st"), replacement: "L⠌"},
Replacement{ pattern: "(?P<s>L.)L⠊L⠝L⠛".to_string(), replacement: "${s}L⠬" }, // 'ing', not at start
Replacement{ pattern: to_unicode_braille("ar"), replacement: "L⠜"},

// 10.6.5: Lower groupsigns preceded and followed by letters
// FIX: don't match if after/before a cap letter -- can't use negative pattern (?!...) in regex package
// Note: removed cc because "arccos" shouldn't be contracted (10.11.1), but there is no way to know about compound words
// Add it back after implementing a lookup dictionary of exceptions
Replacement{ pattern: "(?P<s>L.)L⠑L⠁(?P<e>L.)".to_string(), replacement: "${s}L⠂${e}" }, // ea
Replacement{ pattern: "(?P<s>L.)L⠃L⠃(?P<e>L.)".to_string(), replacement: "${s}L⠆${e}" }, // bb
// Replacement{ pattern: "(?P<s>L.)L⠉L⠉(?P<e>L.)".to_string(), replacement: "${s}L⠒${e}" }, // cc
Replacement{ pattern: "(?P<s>L.)L⠋L⠋(?P<e>L.)".to_string(), replacement: "${s}L⠖${e}" }, // ff
Replacement{ pattern: "(?P<s>L.)L⠛L⠛(?P<e>L.)".to_string(), replacement: "${s}L⠶${e}" }, // gg

// 10.6.8: Lower groupsigns ("in" also 10.5.4 lower wordsigns)
// FIX: these need restrictions about only applying when upper dots are present
Replacement{ pattern: to_unicode_braille("en"), replacement: "⠢"},
Replacement{ pattern: to_unicode_braille("in"), replacement: "⠔"},

];

static ref CONTRACTION_PATTERNS: RegexSet = init_patterns(&CONTRACTIONS);
static ref CONTRACTION_REGEX: Vec<Regex> = init_regex(&CONTRACTIONS);
}

let mut chars_as_str = chars.iter().collect::<String>();
Expand All @@ -1303,17 +1330,17 @@ fn handle_contractions(chars: &[char], mut result: String) -> String {


fn init_patterns(contractions: &[Replacement]) -> RegexSet {
let mut vec = Vec::with_capacity(contractions.len());
let mut vec: Vec<&str> = Vec::with_capacity(contractions.len());
for contraction in contractions {
vec.push(contraction.pattern);
vec.push(&contraction.pattern);
}
return RegexSet::new(&vec).unwrap();
}

fn init_regex(contractions: &[Replacement]) -> Vec<Regex> {
let mut vec = Vec::with_capacity(contractions.len());
for contraction in contractions {
vec.push(Regex::new(contraction.pattern).unwrap());
vec.push(Regex::new(&contraction.pattern).unwrap());
}
return vec;
}
Expand Down
11 changes: 7 additions & 4 deletions tests/braille/UEB/iceb.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,8 @@ fn grade1_1_7_3_1() {
<mo>=</mo>
<msup><mi>x</mi><mn>2</mn></msup>
</math>";
// GTM says it can also be "⠼⠉⠭⠐⠤⠼⠙⠽⠐⠖⠽⠔⠼⠃⠀⠐⠶⠀⠭⠰⠔⠼⠃"
test_braille("UEB", expr, "⠰⠰⠰⠼⠉⠭⠐⠤⠼⠙⠽⠐⠖⠽⠔⠼⠃⠀⠐⠶⠀⠭⠔⠼⠃⠰⠄");
// GTM says it can be either "⠼⠉⠭⠐⠤⠼⠙⠽⠐⠖⠽⠔⠼⠃⠀⠐⠶⠀⠭⠰⠔⠼⠃" or "⠰⠰⠰⠼⠉⠭⠐⠤⠼⠙⠽⠐⠖⠽⠔⠼⠃⠀⠐⠶⠀⠭⠔⠼⠃⠰⠄"
test_braille("UEB", expr, "⠼⠉⠭⠐⠤⠼⠙⠽⠐⠖⠽⠔⠼⠃⠀⠐⠶⠀⠭⠔⠼⠃");
}

#[test]
Expand Down Expand Up @@ -557,7 +557,8 @@ fn fraction_6_4_5() {
#[test]
fn fraction_6_4_6() {
let expr = "<math><mtext>speed</mtext><mo>=</mo><mfrac><mtext>distance</mtext><mtext>time</mtext></mfrac></math>";
test_braille("UEB", expr, "⠰⠰⠰⠎⠏⠑⠑⠙⠀⠐⠶⠀⠷⠙⠊⠎⠞⠁⠝⠉⠑⠨⠌⠞⠊⠍⠑⠾⠰⠄");
// GTM lists two options: "⠎⠏⠑⠫⠀⠐⠶⠀⠰⠰⠷⠙⠊⠎⠞⠁⠝⠉⠑⠨⠌⠞⠊⠍⠑⠾" and "⠰⠰⠰⠎⠏⠑⠑⠙⠀⠐⠶⠀⠷⠙⠊⠎⠞⠁⠝⠉⠑⠨⠌⠞⠊⠍⠑⠾⠰⠄"
test_braille("UEB", expr, "⠎⠏⠑⠫⠀⠐⠶⠀⠰⠰⠷⠙⠊⠎⠞⠁⠝⠉⠑⠨⠌⠞⠊⠍⠑⠾");
}


Expand Down Expand Up @@ -873,7 +874,9 @@ fn text_9_7_1() {
fn stat_9_7_2() {
let expr = "<math><mi>Exp</mi><mo>(</mo><mi>R</mi><mo>)</mo><mo>=</mo>
<mfrac><mi>n</mi><mn>2</mn></mfrac><mo>+</mo><mn>1</mn></math>";
test_braille("UEB", expr, "⠰⠰⠰⠠⠑⠭⠏⠐⠣⠠⠗⠐⠜⠀⠐⠶⠀⠷⠝⠨⠌⠼⠃⠾⠐⠖⠼⠁⠰⠄");
// GTM uses "⠰⠰⠰⠠⠑⠭⠏⠐⠣⠠⠗⠐⠜⠀⠐⠶⠀⠷⠝⠨⠌⠼⠃⠾⠐⠖⠼⠁⠰⠄",
// but "⠠⠑⠭⠏⠐⠣⠠⠗⠐⠜⠀⠐⠶⠀⠰⠰⠷⠝⠨⠌⠼⠃⠾⠐⠖⠼⠁" is shorter and is consistent with omission_3_6_7 and fraction_6_4_6
test_braille("UEB", expr, "⠠⠑⠭⠏⠐⠣⠠⠗⠐⠜⠀⠐⠶⠀⠰⠰⠷⠝⠨⠌⠼⠃⠾⠐⠖⠼⠁");
}

#[test]
Expand Down

0 comments on commit 1c258f4

Please sign in to comment.