From 0bf527d254241595aa48d6c1e4060c0c73920740 Mon Sep 17 00:00:00 2001
From: Colin Rofls <colin@cmyr.net>
Date: Tue, 21 May 2024 16:33:41 -0400
Subject: [PATCH] [fontbe] Get off patched version of icu_properties

This wasn't too hard, although it does require us to duplicate a bit of
code in a few places and otherwise slightly complicate some impls.
---
 Cargo.toml                        |  6 --
 fontbe/src/features/kern.rs       | 13 +----
 fontbe/src/features/properties.rs | 94 ++++++++++++++-----------------
 3 files changed, 44 insertions(+), 69 deletions(-)
diff --git a/Cargo.toml b/Cargo.toml
index c02e10b7..a16f2a7e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -52,9 +52,3 @@ members = [
     "fea-lsp",
     "otl-normalizer",
 ]
-
-# remove this when icu_properties 1.5 is released:
-# https://github.com/unicode-org/icu4x/issues?q=is%3Aopen+is%3Aissue+milestone%3A%221.5+Blocking+⟨P1⟩%22
-[patch.crates-io]
-icu_properties = { version = "1.4", git = "https://github.com/unicode-org/icu4x.git", rev = "728eb44" }
-tinystr = { version = "0.7.5", git = "https://github.com/unicode-org/icu4x.git", rev = "728eb44" }
diff --git a/fontbe/src/features/kern.rs b/fontbe/src/features/kern.rs
index 45f67039..9d9e1f49 100644
--- a/fontbe/src/features/kern.rs
+++ b/fontbe/src/features/kern.rs
@@ -598,7 +598,7 @@ struct KernSplitContext {
     /// map of all mark glyphs + whether they are spacing or not
     mark_glyphs: HashMap<GlyphId, MarkSpacing>,
     glyph_scripts: HashMap<GlyphId, HashSet<UnicodeShortName>>,
-    bidi_glyphs: HashMap<BidiClass, HashSet<GlyphId>>,
+    bidi_glyphs: BTreeMap<BidiClass, HashSet<GlyphId>>,
     opts: KernSplitOptions,
     dflt_scripts: HashSet<UnicodeShortName>,
     common_scripts: HashSet<UnicodeShortName>,
@@ -672,7 +672,7 @@ impl KernSplitContext {
     ) -> BTreeMap<BTreeSet<UnicodeShortName>, Vec<PendingLookup<PairPosBuilder>>> {
         let mut lookups_by_script = BTreeMap::new();
         let kerning_per_script = self.split_kerns(pairs);
-        let mut bidi_buf = HashSet::new(); // we can reuse this for each pair
+        let mut bidi_buf = BTreeSet::new(); // we can reuse this for each pair
         for (scripts, pairs) in kerning_per_script {
             let mut builder = PairPosBuilder::default();
             for mut pair in pairs {
@@ -996,14 +996,7 @@ fn guess_font_scripts(ast: &ParseTree, glyphs: &impl CharMap) -> HashSet<Unicode
 fn scripts_for_chars(glyphs: &impl CharMap) -> HashSet<UnicodeShortName> {
     glyphs
         .iter_glyphs()
-        .filter_map(|(_, codepoint)| {
-            let mut scripts = super::properties::unicode_script_extensions(codepoint);
-            // only if a codepoint has a single script do know it is supported
-            match (scripts.next(), scripts.next()) {
-                (Some(script), None) => Some(script),
-                _ => None,
-            }
-        })
+        .filter_map(|(_, codepoint)| super::properties::single_script_for_codepoint(codepoint))
         .collect()
 }
 
diff --git a/fontbe/src/features/properties.rs b/fontbe/src/features/properties.rs
index a9bf7b2c..61b10dfc 100644
--- a/fontbe/src/features/properties.rs
+++ b/fontbe/src/features/properties.rs
@@ -1,12 +1,13 @@
 //! Properties and constants related to unicode data
 use std::{
-    collections::{HashMap, HashSet},
+    collections::{BTreeMap, HashMap, HashSet},
     hash::Hash,
     sync::Arc,
 };
 
 use fontir::ir::Glyph;
-use icu_properties::{script::ScriptWithExtensionsBorrowed, BidiClass, Script};
+use icu_properties::{BidiClass, Script};
+use tinystr::tinystr;
 use write_fonts::{
     read::{tables::gsub::Gsub, ReadError},
     types::{GlyphId, Tag},
@@ -16,16 +17,8 @@ use crate::features::ot_tags::{NEW_SCRIPTS, SCRIPT_ALIASES, SCRIPT_EXCEPTIONS_RE
 
 use super::ot_tags::{DFLT_SCRIPT, INDIC_SCRIPTS, NEW_SCRIPT_TAGS, SCRIPT_EXCEPTIONS, USE_SCRIPTS};
 
-// SAFETY: we can visually verify that these inputs contain only non-null ASCII bytes
-// (this is the only way we can declare these as constants)
-// TODO: remove this if https://github.com/unicode-org/icu4x/pull/4691 is merged/released
-pub const COMMON_SCRIPT: UnicodeShortName =
-    unsafe { UnicodeShortName::from_bytes_unchecked(*b"Zyyy") };
-pub const INHERITED_SCRIPT: UnicodeShortName =
-    unsafe { UnicodeShortName::from_bytes_unchecked(*b"Zinh") };
-
-static SCRIPT_DATA: ScriptWithExtensionsBorrowed<'static> =
-    icu_properties::script::script_with_extensions();
+pub const COMMON_SCRIPT: UnicodeShortName = tinystr!(4, "Zyyy");
+pub const INHERITED_SCRIPT: UnicodeShortName = tinystr!(4, "Zinh");
 
 /// The type used by icu4x for script names
 pub type UnicodeShortName = tinystr::TinyAsciiStr<4>;
@@ -89,25 +82,16 @@ impl ScriptDirection {
     }
 }
 
-/// Iterate over the unicode scripts + extensions for the provided codepoint
-///
-/// This returns the scripts as shortnames, because that's what python does.
-/// It would probably make more sense for us to use the Script type defined by
-/// icu4x, but I want to get a more direct port working first.
-pub(crate) fn unicode_script_extensions(
-    c: u32,
-) -> impl Iterator<Item = UnicodeShortName> + 'static {
+/// Iff a codepoint belongs to a single script, return it.
+pub(crate) fn single_script_for_codepoint(cp: u32) -> Option<UnicodeShortName> {
     let lookup = Script::enum_to_short_name_mapper();
-    SCRIPT_DATA
-        .get_script_extensions_val(c)
-        .iter()
-        .map(move |script| {
-            lookup
-                .get(script)
-                // if we get a script it is by definition a 4-char ascii string,
-                // so this unwrap should never fail
-                .expect("names should be available for all defined scripts")
-        })
+    let data = icu_properties::script::script_with_extensions().get_script_extensions_val(cp);
+    let mut scripts = data.iter();
+
+    match (scripts.next(), scripts.next()) {
+        (Some(script), None) => lookup.get(script),
+        _ => None,
+    }
 }
 
 // <https://github.com/googlefonts/ufo2ft/blob/f6b4f42460b340c/Lib/ufo2ft/featureWriters/kernFeatureWriter.py#L49>
@@ -124,22 +108,24 @@ fn unicode_bidi_type(c: u32) -> Option<BidiClass> {
 
 // equivalent to the 'classify' method in ufo2ft:
 // <https://github.com/googlefonts/ufo2ft/blob/cea60d71dfcf0b1c0f/Lib/ufo2ft/util.py#L287>
-fn classify<T, F, I, CM>(
+fn classify<T, F, CM>(
     char_map: &CM,
     mut props_fn: F,
     gsub: Option<&Gsub>,
-) -> Result<HashMap<T, HashSet<GlyphId>>, ReadError>
+) -> Result<BTreeMap<T, HashSet<GlyphId>>, ReadError>
 where
-    T: Hash + Eq,
-    I: Iterator<Item = T>,
-    F: FnMut(u32) -> I,
+    T: Ord + Eq,
+    // instead of returning an iterator, pushes items into the provided buffer
+    F: FnMut(u32, &mut Vec<T>),
     CM: CharMap,
 {
-    let mut sets = HashMap::new();
+    let mut sets = BTreeMap::new();
     let mut neutral_glyphs = HashSet::new();
+    let mut buf = Vec::new();
     for (gid, unicode_value) in char_map.iter_glyphs() {
         let mut has_props = false;
-        for prop in props_fn(unicode_value) {
+        props_fn(unicode_value, &mut buf);
+        for prop in buf.drain(..) {
             sets.entry(prop).or_insert(HashSet::new()).insert(gid);
             has_props = true;
         }
@@ -169,23 +155,25 @@ pub(crate) fn scripts_by_glyph(
     gsub: Option<&Gsub>,
 ) -> Result<HashMap<GlyphId, HashSet<UnicodeShortName>>, ReadError> {
     let mut result = HashMap::new();
+    let lookup = Script::enum_to_short_name_mapper();
     for (script, glyphs) in classify(
         glyphs,
-        |cp| {
-            // we need to write this in such a way as to return a single concrete type;
-            // this is basically two branches, one or the other option is always `None`
-            let common = known_scripts.is_empty().then_some(COMMON_SCRIPT);
-            let other_branch = if known_scripts.is_empty() {
-                None
+        |cp, buf| {
+            if known_scripts.is_empty() {
+                buf.push(COMMON_SCRIPT);
             } else {
-                Some(unicode_script_extensions(cp).filter(|script| {
-                    *script == COMMON_SCRIPT
-                        || *script == INHERITED_SCRIPT
-                        || known_scripts.contains(script)
-                }))
-            };
-
-            common.into_iter().chain(other_branch.into_iter().flatten())
+                let data =
+                    icu_properties::script::script_with_extensions().get_script_extensions_val(cp);
+                buf.extend(
+                    data.iter()
+                        .map(|s| lookup.get(s).unwrap())
+                        .filter(|script| {
+                            *script == COMMON_SCRIPT
+                                || *script == INHERITED_SCRIPT
+                                || known_scripts.contains(script)
+                        }),
+                );
+            }
         },
         gsub,
     )? {
@@ -200,10 +188,10 @@ pub(crate) fn scripts_by_glyph(
 pub(crate) fn glyphs_by_bidi_class(
     glyphs: &impl CharMap,
     gsub: Option<&Gsub>,
-) -> Result<HashMap<BidiClass, HashSet<GlyphId>>, ReadError> {
+) -> Result<BTreeMap<BidiClass, HashSet<GlyphId>>, ReadError> {
     classify(
         glyphs,
-        |codepoint| unicode_bidi_type(codepoint).into_iter(),
+        |codepoint, buf| buf.extend(unicode_bidi_type(codepoint)),
         gsub,
     )
 }