Merge pull request #224 from NSoiffer/UEB

UEB -- adds more cases where passage mode is not used along with more contractions
NSoiffer · Nov 16, 2023 · 73fb7c9 · 73fb7c9
2 parents 3baa649 + 1c258f4
commit 73fb7c9
Showing 8 changed files with 209 additions and 76 deletions.
diff --git a/Rules/Braille/UEB/UEB_Rules.yaml b/Rules/Braille/UEB/UEB_Rules.yaml
@@ -69,7 +69,7 @@
    replace: 
    - t: "1⠷"
    - x: "*[1]"
-   - t: "⠨⠌"
+   - t: "1⠨⠌"
    - x: "*[2]"
    - t: "1⠾"
 
@@ -90,7 +90,7 @@
   match: "IsBracketed(., '(', ')') and *[2][self::m:mtable][count(*)=2 and count(*[1])=1] and contains(@intent, 'binomial(')"
   replace:
      - x: "*[1]"
-     - x: "DEBUG(*[2]/*[1]/*[1]/*[1])"
+     - x: "*[2]/*[1]/*[1]/*[1]"
      - t: "⠰⠻"
      - x: "*[2]/*[2]/*[1]/*[1]"
      - x: "*[3]"
@@ -179,6 +179,20 @@
    match: "."
    replace: [x: "*"]
 
+
+-
+   # operator omission
+   name: operator-omission
+   tag: mo
+   # normally(?) this is an omission, but the case 6'2" causes problems due to WIRIS
+   # in that case, it doesn't put the ft/in in a superscript and thinks the space is an operator
+   # canonicalization moves them into <msup> with base <mn>, so this shouldn't be an omission for that case
+   match: "translate(., '\u00A0', '')='' and
+           not( preceding-sibling::*[1][self::m:msup and *[2][self::m:mo]] and
+                following-sibling::*[1][self::m:msup and *[2][self::m:mo]] )"
+   replace:
+   - t: "⠬"   # empty space for omission
+
 -
    name: default
    tag: mo
@@ -241,15 +255,13 @@
    replace:
    - x: "BrailleChars(., 'UEB')"
 
-
 -
    # FIX: need to deal with all caps
    name: default
    tag: [mi, mtext]
    match: "."
    replace:
-      - x: "BrailleChars(., 'UEB')"
-
+   - x: "BrailleChars(., 'UEB')"
 
 -
    name: default

diff --git a/Rules/Braille/UEB/unicode.yaml b/Rules/Braille/UEB/unicode.yaml
@@ -264,6 +264,8 @@
           then: [t: "N⠐"]
         - else_if: "following-sibling::*[1][@class='MathML-unit' or BaseNode(.)[@class='MathML-unit']]"
           then: [t: "𝐖"]
+        - else_if: "@data-added='missing-content' or @data-changed='empty_content'"
+          then: [t: "⠬"]       # omission
           else: [t: "W"]
  - ",":                        # 0x2c (Comma)
      - test:

diff --git a/Rules/Languages/en/navigate.yaml b/Rules/Languages/en/navigate.yaml
@@ -254,15 +254,20 @@
       then: [t: "zoom in", pause: "long"]                 # phrase('zoom in')
   - set_variables: [NavNode: "*[2]/*[1]/@id"]
 
-# special case of zooming into a table -- move to the first row
+# special case of zooming into a table -- move to the first row (if only one row, first column)
 - name: zoom-in-table
   tag: mtable
   match: "$NavCommand = 'ZoomIn'"
   replace:
   - test:
       if: "$MatchCounter = 0 and $NavVerbosity = 'Verbose'"
       then: [t: "zoom in", pause: "long"]               # phrase('zoom in')
-  - set_variables: [NavNode: "*[1]/@id"]
+  - test:
+      if: "count(*)=1"
+      then: 
+      - set_variables: [NavNode: "*[1]/*[1]/@id"]
+      else:
+      - set_variables: [NavNode: "*[1]/@id"]
 
 - name: zoom-in-mrow-in-math
   # Moving to first or last is meaningless the 'math' has only an 'mrow' inside -- dig inside and do it again 

diff --git a/src/braille.rs b/src/braille.rs
@@ -574,31 +574,20 @@ fn ueb_cleanup(pref_manager: Ref<PreferenceManager>, raw_braille: String) -> Str
         if is_grade2_string_ok(&grade2) {
             return grade2;
         } else {
-            let grade1_word = remove_unneeded_mode_changes(raw_braille, UEB_Mode::Grade1, UEB_Duration::Word);
-            debug!("Word mode:    '{}'", grade1_word);
-
             // BANA says use g1 word mode if spaces are present, but that's not what their examples do
             // A conversation with Ms. DeAndrea from BANA said that they mean use passage mode if ≥3 "segments" (≥2 blanks)
-            // However, it is pointless to go into passage mode if the internal string is the same as word mode
-            let mut grade1_passage = "".to_string();
-            let mut n_blanks = 0;
-            if grade1_word.chars().any(|ch| {
-                if ch == 'W' {
-                    n_blanks += 1;
-                }
-                n_blanks == 2
-            }) {
-                grade1_passage = remove_unneeded_mode_changes(raw_braille, UEB_Mode::Grade1, UEB_Duration::Passage);
-                // debug!("Passage mode: '{}'", &grade1_passage);
-            }
-            if grade1_passage.is_empty() || grade1_passage == grade1_word {
-                return "⠰⠰".to_string() + &grade1_word;
+            // The G1 Word mode might not be at the start (iceb.rs:omission_3_6_7)
+            let grade1_word = try_grade1_word_mode(raw_braille);
+            debug!("Word mode:    '{}'", grade1_word);
+            if !grade1_word.is_empty() {
+                return grade1_word;
             } else {
+                let grade1_passage = remove_unneeded_mode_changes(raw_braille, UEB_Mode::Grade1, UEB_Duration::Passage);
                 return "⠰⠰⠰".to_string() + &grade1_passage + "⠰⠄";
             }
         }
 
-        /// Return true if the BANA guidelines say it is ok to start with grade 2
+        /// Return true if the BANA or ICEB guidelines say it is ok to start with grade 2
         fn is_grade2_string_ok(grade2_braille: &str) -> bool {
             // BANA says use grade 2 if there is not more than one grade one symbol or single letter standing alone.
             // The exact quote from their guidance:
@@ -607,12 +596,14 @@ fn ueb_cleanup(pref_manager: Ref<PreferenceManager>, raw_braille: String) -> Str
             //    begin the expression with a grade 1 word indicator
             // Note: I modified this slightly to exclude the cap indicator in the count. That allows three more ICEB rule to pass and seems
             //    like it is a reasonable thing to do.
+            // Another modification is to allow a single G1 indicator to occur after whitespace later on
+            //    because ICEB examples show it and it seems better than going to passage mode if it is the only G1 indicator
 
             // Because of the 'L's which go away, we have to put a little more work into finding the first three chars
             let chars = grade2_braille.chars().collect::<Vec<char>>();
             let mut n_real_chars = 0;  // actually number of chars
             let mut found_g1 = false;
-            let mut i = 0;      // chars starts on the 4th char
+            let mut i = 0;
             while i < chars.len() {
                 let ch = chars[i];
                 if ch == '1' && !is_forced_grade1(&chars, i) {
@@ -623,22 +614,29 @@ fn ueb_cleanup(pref_manager: Ref<PreferenceManager>, raw_braille: String) -> Str
                 } else if !"𝐶CLobc".contains(ch) {
                     if n_real_chars == 2 {
                         i += 1;
-                        break;      // this is the third real char
+                        break;              // this is the third real char
                     };
                     n_real_chars += 1;
                 }
                 i += 1
             }
 
-            // if we find another g1 that isn't forced and isn't standing alone, we are done
+            // if we find *another* g1 that isn't forced and isn't standing alone, we are done
+            // I've added a 'follows whitespace' clause for test iceb.rs:omission_3_6_2 to the standing alone rule
             // we only allow one standing alone example -- not sure if BANA guidance has this limit, but GTM 11_5_5_3 seems better with it
+            // Same for GTM 1_7_3_1 (passage mode is mentioned also)
             let mut is_standing_alone_already_encountered = false;
+            let mut is_after_whitespace = false;
             while i < chars.len() {
                 let ch = chars[i];
-                if ch == '1' && !is_forced_grade1(&chars, i) {
-                    if !is_single_letter_on_right(&chars, i) || is_standing_alone_already_encountered {
+                if ch == 'W' {
+                    is_after_whitespace = true;
+                } else if ch == '1' && !is_forced_grade1(&chars, i) {
+                    if is_standing_alone_already_encountered ||
+                       ((found_g1 || !is_after_whitespace) && !is_single_letter_on_right(&chars, i)) {
                         return false;
                     }
+                    found_g1 = true;
                     is_standing_alone_already_encountered = true; 
                 }
                 i += 1;
@@ -691,6 +689,31 @@ fn ueb_cleanup(pref_manager: Ref<PreferenceManager>, raw_braille: String) -> Str
             }
             return true;
         }
+
+        fn try_grade1_word_mode(raw_braille: &str) -> String {
+            // this isn't quite right, but pretty close -- try splitting at 'W' (words)
+            // only one of the parts can be in word mode and none of the others can have '1' unless forced
+            let mut g1_words = Vec::default();
+            let mut found_word_mode = false;
+            for raw_word in raw_braille.split('W') {
+                let word = remove_unneeded_mode_changes(raw_word, UEB_Mode::Grade2, UEB_Duration::Symbol);
+                debug!("try_grade1_word_mode: word='{}'", word);
+                let word_chars = word.chars().collect::<Vec<char>>();
+                let needs_word_mode = word_chars.iter().enumerate()
+                    .any(|(i, &ch) | ch == '1' && !is_forced_grade1(&word_chars, i));
+                if needs_word_mode {
+                    if found_word_mode {
+                        return "".to_string();
+                    }
+                    found_word_mode = true;
+                    g1_words.push("⠰⠰".to_string() + &remove_unneeded_mode_changes(raw_word, UEB_Mode::Grade1, UEB_Duration::Word)
+                    );
+                } else {
+                    g1_words.push(word);
+                }
+            }
+            return if found_word_mode {g1_words.join("W")} else {"".to_string()};
+        }
     }
 }
 
@@ -765,7 +788,7 @@ fn capitals_to_word_mode(braille: &str) -> String {
             if find_next_char(&chars[next_non_cap..], 'C').is_some() { // next letter sequence "C..."
                 if is_next_char_start_of_section_12_modifier(&chars[next_non_cap+1..]) {
                     // to me this is tricky -- section 12 modifiers apply to the previous item
-                    // the last clause of the "item" def is the previous "individual symbol" which ICEB 2.1 say is:
+                    // the last clause of the "item" def is the previous "individual symbol" which ICEB 2.1 says is:
                     //   braille sign: one or more consecutive braille characters comprising a unit,
                     //     consisting of a root on its own or a root preceded by one or more
                     //     prefixes (also referred to as braille symbol)
@@ -1217,43 +1240,78 @@ fn stands_alone(chars: &[char], i: usize) -> (bool, &[char], usize) {
     }
 }
 
+
 /// Return a modified result if chars can be contracted.
 /// Otherwise, the original string is returned
 fn handle_contractions(chars: &[char], mut result: String) -> String {
     struct Replacement {
-        pattern: &'static str,
+        pattern: String,
         replacement: &'static str
     }
 
-    // It would be much better from an extensibility point of view to read the table in from a file
-    // FIX: this would be much easier to read/maintain if ASCII braille were used
-    // FIX:   (without the "L"s) and the CONTRACTIONS table built as a lazy static
-    static CONTRACTIONS: &[Replacement] = &[
-        Replacement{ pattern: "L⠁L⠝L⠙", replacement: "L⠯" },           // and
-        Replacement{ pattern: "L⠋L⠕L⠗", replacement: "L⠿" },           // for
-        Replacement{ pattern: "L⠕L⠋", replacement: "L⠷" },             // of
-        Replacement{ pattern: "L⠞L⠓L⠑", replacement: "L⠮" },           // the
-        Replacement{ pattern: "L⠺L⠊L⠞L⠓", replacement: "L⠾" },         // with
-        Replacement{ pattern: "L⠉L⠓", replacement: "L⠡" },              // ch
-        Replacement{ pattern: "L⠊L⠝", replacement: "L⠔" },              // in
-
-        // cc -- don't match if after/before a cap letter -- no/can't use negative pattern (?!...) in regex package
-        // figure this out -- also applies to ea, bb, ff, and gg (not that they matter)
-        // cc may be important for "arccos", but RUEB doesn't apply it to "arccosine", so maybe not
-        // Replacement{ pattern: "L⠉L⠉", replacement: "L⠒" },              // cc -- don't match if after/before a cap letter
-
-
-        Replacement{ pattern: "L⠎L⠓", replacement: "L⠩" },              // sh
-        Replacement{ pattern: "L⠁L⠗", replacement: "L⠜" },              // ar
-        Replacement{ pattern: "L⠑L⠗", replacement: "L⠻" },              // er
-        Replacement{ pattern: "(?P<s>L.)L⠍L⠑L⠝L⠞", replacement: "${s}L⠰L⠞" }, // ment
-        Replacement{ pattern: "(?P<s>L.)L⠞L⠊L⠕L⠝", replacement: "${s}L⠰L⠝" } ,// tion
-        Replacement{ pattern: "(?P<s>L.)L⠑L⠁(?P<e>L.)", replacement: "${s}L⠂${e}" },  // ea
+    const ASCII_TO_UNICODE: &[char] = &[
+        '⠀', '⠮', '⠐', '⠼', '⠫', '⠩', '⠯', '⠄', '⠷', '⠾', '⠡', '⠬', '⠠', '⠤', '⠨', '⠌',
+        '⠴', '⠂', '⠆', '⠒', '⠲', '⠢', '⠖', '⠶', '⠦', '⠔', '⠱', '⠰', '⠣', '⠿', '⠜', '⠹',
+        '⠈', '⠁', '⠃', '⠉', '⠙', '⠑', '⠋', '⠛', '⠓', '⠊', '⠚', '⠅', '⠇', '⠍', '⠝', '⠕',
+        '⠏', '⠟', '⠗', '⠎', '⠞', '⠥', '⠧', '⠺', '⠭', '⠽', '⠵', '⠪', '⠳', '⠻', '⠘', '⠸',
     ];
 
+    fn to_unicode_braille(ascii: &str) -> String {
+        let mut unicode = String::with_capacity(4*ascii.len());   // 'L' + 3 bytes for braille char
+        for ch in ascii.as_bytes() {
+            unicode.push('L');
+            unicode.push(ASCII_TO_UNICODE[(ch.to_ascii_uppercase() - 32) as usize])
+        }
+        return unicode;
+    }
+
+    // It would be much better from an extensibility point of view to read the table in from a file
     lazy_static! {
-        static ref CONTRACTION_PATTERNS: RegexSet = init_patterns(CONTRACTIONS);
-        static ref CONTRACTION_REGEX: Vec<Regex> = init_regex(CONTRACTIONS);
+        static ref CONTRACTIONS: Vec<Replacement> = vec![
+            // 10.3: Strong contractions
+            Replacement{ pattern: to_unicode_braille("and"), replacement: "L⠯"},
+            Replacement{ pattern: to_unicode_braille("for"), replacement: "L⠿"},
+            Replacement{ pattern: to_unicode_braille("of"), replacement: "L⠷"},
+            Replacement{ pattern: to_unicode_braille("the"), replacement: "L⠮"},
+            Replacement{ pattern: to_unicode_braille("with"), replacement: "L⠾"},
+
+            // 10.8: final-letter groupsigns (these need to precede 'en' and any other shorter contractions)
+            Replacement{ pattern: "(?P<s>L.)L⠍L⠑L⠝L⠞".to_string(), replacement: "${s}L⠰L⠞" }, // ment
+            Replacement{ pattern: "(?P<s>L.)L⠞L⠊L⠕L⠝".to_string(), replacement: "${s}L⠰L⠝" } ,// tion
+
+            // 10.4: Strong groupsigns
+            Replacement{ pattern: to_unicode_braille("ch"), replacement: "L⠡"},
+            Replacement{ pattern: to_unicode_braille("gh"), replacement: "L⠣"},
+            Replacement{ pattern: to_unicode_braille("sh"), replacement: "L⠩"},
+            Replacement{ pattern: to_unicode_braille("th"), replacement: "L⠹"},
+            Replacement{ pattern: to_unicode_braille("wh"), replacement: "L⠱"},
+            Replacement{ pattern: to_unicode_braille("ed"), replacement: "L⠫"},
+            Replacement{ pattern: to_unicode_braille("er"), replacement: "L⠻"},
+            Replacement{ pattern: to_unicode_braille("ou"), replacement: "L⠳"},
+            Replacement{ pattern: to_unicode_braille("ow"), replacement: "L⠪"},
+            Replacement{ pattern: to_unicode_braille("st"), replacement: "L⠌"},
+            Replacement{ pattern: "(?P<s>L.)L⠊L⠝L⠛".to_string(), replacement: "${s}L⠬" },  // 'ing', not at start
+            Replacement{ pattern: to_unicode_braille("ar"), replacement: "L⠜"},
+
+            // 10.6.5: Lower groupsigns preceded and followed by letters
+            // FIX: don't match if after/before a cap letter -- can't use negative pattern (?!...) in regex package
+            // Note: removed cc because "arccos" shouldn't be contracted (10.11.1), but there is no way to know about compound words
+            // Add it back after implementing a lookup dictionary of exceptions
+            Replacement{ pattern: "(?P<s>L.)L⠑L⠁(?P<e>L.)".to_string(), replacement: "${s}L⠂${e}" },  // ea
+            Replacement{ pattern: "(?P<s>L.)L⠃L⠃(?P<e>L.)".to_string(), replacement: "${s}L⠆${e}" },  // bb
+            // Replacement{ pattern: "(?P<s>L.)L⠉L⠉(?P<e>L.)".to_string(), replacement: "${s}L⠒${e}" },  // cc
+            Replacement{ pattern: "(?P<s>L.)L⠋L⠋(?P<e>L.)".to_string(), replacement: "${s}L⠖${e}" },  // ff
+            Replacement{ pattern: "(?P<s>L.)L⠛L⠛(?P<e>L.)".to_string(), replacement: "${s}L⠶${e}" },  // gg
+
+            // 10.6.8: Lower groupsigns ("in" also 10.5.4 lower wordsigns)
+            // FIX: these need restrictions about only applying when upper dots are present
+            Replacement{ pattern: to_unicode_braille("en"), replacement: "⠢"},
+            Replacement{ pattern: to_unicode_braille("in"), replacement: "⠔"},
+
+        ];
+
+        static ref CONTRACTION_PATTERNS: RegexSet = init_patterns(&CONTRACTIONS);
+        static ref CONTRACTION_REGEX: Vec<Regex> = init_regex(&CONTRACTIONS);
     }
 
     let mut chars_as_str = chars.iter().collect::<String>();
@@ -1272,17 +1330,17 @@ fn handle_contractions(chars: &[char], mut result: String) -> String {
 
 
     fn init_patterns(contractions: &[Replacement]) -> RegexSet {
-        let mut vec = Vec::with_capacity(contractions.len());
+        let mut vec: Vec<&str> = Vec::with_capacity(contractions.len());
         for contraction in contractions {
-            vec.push(contraction.pattern);
+            vec.push(&contraction.pattern);
         }
         return RegexSet::new(&vec).unwrap();
     }
 
     fn init_regex(contractions: &[Replacement]) -> Vec<Regex> {
         let mut vec = Vec::with_capacity(contractions.len());
         for contraction in contractions {
-            vec.push(Regex::new(contraction.pattern).unwrap());
+            vec.push(Regex::new(&contraction.pattern).unwrap());
         }
         return vec;
     }

diff --git a/src/canonicalize.rs b/src/canonicalize.rs
@@ -809,6 +809,7 @@ impl CanonicalizeContext {
 				if !text.is_empty() && IS_WHITESPACE.is_match(text) {
 					// can't throw it out because it is needed by braille -- change to what it really is
 					set_mathml_name(mathml, "mtext");
+					mathml.set_attribute_value(CHANGED_ATTR, "was-mo");
 					return Some(mathml);
 				} else {
 					match text {
@@ -3612,7 +3613,13 @@ impl CanonicalizeContext {
 							} else {
 								OperatorPair{ ch: "\u{2062}", op: &IMPLIED_TIMES }
 							};
-
+					if let Some(attr_val) = base_of_child.attribute_value(CHANGED_ATTR) {
+						if attr_val == "was-mo" {
+							// it really should be an operator
+							base_of_child.remove_attribute(CHANGED_ATTR);
+							set_mathml_name(base_of_child, "mo");
+						}
+					}
 					if name(&base_of_child) == "mo" {
 						current_op.ch = as_text(base_of_child);
 						// debug!("  Found whitespace op '{}'/{}", show_invisible_op_char(current_op.ch), current_op.op.priority);

diff --git a/src/main.rs b/src/main.rs
@@ -170,7 +170,7 @@ fn main() {
   //     </mrow>
   //   </math>";
   let expr=r#"
-  <math><mn>0</mn><mo>,</mo><mn>1</mn> </math>
+  <math><mrow><mn>5</mn><mo>=</mo><msqrt><mrow/></msqrt></mrow></math>
              "#;
 
 //   let expr = "
@@ -213,14 +213,14 @@ fn main() {
     panic!("Error: exiting -- {}", errors_to_string(&e));
   };
 
-  match get_spoken_text() {
-    Ok(speech) => info!("Computed speech string:\n   '{}'", speech),
-    Err(e) => panic!("{}", errors_to_string(&e)),
-  }
-  info!("SpeechStyle: {:?}", get_preference("SpeechStyle".to_string()).unwrap());
+  // match get_spoken_text() {
+  //   Ok(speech) => info!("Computed speech string:\n   '{}'", speech),
+  //   Err(e) => panic!("{}", errors_to_string(&e)),
+  // }
+  // info!("SpeechStyle: {:?}", get_preference("SpeechStyle".to_string()).unwrap());
 
 
-  set_preference("BrailleCode".to_string(), "CMU".to_string()).unwrap();
+  set_preference("BrailleCode".to_string(), "UEB".to_string()).unwrap();
   match get_braille("".to_string()) {
     Ok(braille) => info!("Computed braille string:\n   '{}'", braille),
     Err(e) => panic!("{}", errors_to_string(&e)),