From 0bf527d254241595aa48d6c1e4060c0c73920740 Mon Sep 17 00:00:00 2001 From: Colin Rofls Date: Tue, 21 May 2024 16:33:41 -0400 Subject: [PATCH] [fontbe] Get off patched version of icu_properties This wasn't too hard, although it does require us to duplicate a bit of code in a few places and otherwise slightly complicate some impls. --- Cargo.toml | 6 -- fontbe/src/features/kern.rs | 13 +---- fontbe/src/features/properties.rs | 94 ++++++++++++++----------------- 3 files changed, 44 insertions(+), 69 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index c02e10b7..a16f2a7e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -52,9 +52,3 @@ members = [ "fea-lsp", "otl-normalizer", ] - -# remove this when icu_properties 1.5 is released: -# https://github.com/unicode-org/icu4x/issues?q=is%3Aopen+is%3Aissue+milestone%3A%221.5+Blocking+⟨P1⟩%22 -[patch.crates-io] -icu_properties = { version = "1.4", git = "https://github.com/unicode-org/icu4x.git", rev = "728eb44" } -tinystr = { version = "0.7.5", git = "https://github.com/unicode-org/icu4x.git", rev = "728eb44" } diff --git a/fontbe/src/features/kern.rs b/fontbe/src/features/kern.rs index 45f67039..9d9e1f49 100644 --- a/fontbe/src/features/kern.rs +++ b/fontbe/src/features/kern.rs @@ -598,7 +598,7 @@ struct KernSplitContext { /// map of all mark glyphs + whether they are spacing or not mark_glyphs: HashMap, glyph_scripts: HashMap>, - bidi_glyphs: HashMap>, + bidi_glyphs: BTreeMap>, opts: KernSplitOptions, dflt_scripts: HashSet, common_scripts: HashSet, @@ -672,7 +672,7 @@ impl KernSplitContext { ) -> BTreeMap, Vec>> { let mut lookups_by_script = BTreeMap::new(); let kerning_per_script = self.split_kerns(pairs); - let mut bidi_buf = HashSet::new(); // we can reuse this for each pair + let mut bidi_buf = BTreeSet::new(); // we can reuse this for each pair for (scripts, pairs) in kerning_per_script { let mut builder = PairPosBuilder::default(); for mut pair in pairs { @@ -996,14 +996,7 @@ fn guess_font_scripts(ast: &ParseTree, glyphs: &impl CharMap) -> HashSet HashSet { glyphs .iter_glyphs() - .filter_map(|(_, codepoint)| { - let mut scripts = super::properties::unicode_script_extensions(codepoint); - // only if a codepoint has a single script do know it is supported - match (scripts.next(), scripts.next()) { - (Some(script), None) => Some(script), - _ => None, - } - }) + .filter_map(|(_, codepoint)| super::properties::single_script_for_codepoint(codepoint)) .collect() } diff --git a/fontbe/src/features/properties.rs b/fontbe/src/features/properties.rs index a9bf7b2c..61b10dfc 100644 --- a/fontbe/src/features/properties.rs +++ b/fontbe/src/features/properties.rs @@ -1,12 +1,13 @@ //! Properties and constants related to unicode data use std::{ - collections::{HashMap, HashSet}, + collections::{BTreeMap, HashMap, HashSet}, hash::Hash, sync::Arc, }; use fontir::ir::Glyph; -use icu_properties::{script::ScriptWithExtensionsBorrowed, BidiClass, Script}; +use icu_properties::{BidiClass, Script}; +use tinystr::tinystr; use write_fonts::{ read::{tables::gsub::Gsub, ReadError}, types::{GlyphId, Tag}, @@ -16,16 +17,8 @@ use crate::features::ot_tags::{NEW_SCRIPTS, SCRIPT_ALIASES, SCRIPT_EXCEPTIONS_RE use super::ot_tags::{DFLT_SCRIPT, INDIC_SCRIPTS, NEW_SCRIPT_TAGS, SCRIPT_EXCEPTIONS, USE_SCRIPTS}; -// SAFETY: we can visually verify that these inputs contain only non-null ASCII bytes -// (this is the only way we can declare these as constants) -// TODO: remove this if https://github.com/unicode-org/icu4x/pull/4691 is merged/released -pub const COMMON_SCRIPT: UnicodeShortName = - unsafe { UnicodeShortName::from_bytes_unchecked(*b"Zyyy") }; -pub const INHERITED_SCRIPT: UnicodeShortName = - unsafe { UnicodeShortName::from_bytes_unchecked(*b"Zinh") }; - -static SCRIPT_DATA: ScriptWithExtensionsBorrowed<'static> = - icu_properties::script::script_with_extensions(); +pub const COMMON_SCRIPT: UnicodeShortName = tinystr!(4, "Zyyy"); +pub const INHERITED_SCRIPT: UnicodeShortName = tinystr!(4, "Zinh"); /// The type used by icu4x for script names pub type UnicodeShortName = tinystr::TinyAsciiStr<4>; @@ -89,25 +82,16 @@ impl ScriptDirection { } } -/// Iterate over the unicode scripts + extensions for the provided codepoint -/// -/// This returns the scripts as shortnames, because that's what python does. -/// It would probably make more sense for us to use the Script type defined by -/// icu4x, but I want to get a more direct port working first. -pub(crate) fn unicode_script_extensions( - c: u32, -) -> impl Iterator + 'static { +/// Iff a codepoint belongs to a single script, return it. +pub(crate) fn single_script_for_codepoint(cp: u32) -> Option { let lookup = Script::enum_to_short_name_mapper(); - SCRIPT_DATA - .get_script_extensions_val(c) - .iter() - .map(move |script| { - lookup - .get(script) - // if we get a script it is by definition a 4-char ascii string, - // so this unwrap should never fail - .expect("names should be available for all defined scripts") - }) + let data = icu_properties::script::script_with_extensions().get_script_extensions_val(cp); + let mut scripts = data.iter(); + + match (scripts.next(), scripts.next()) { + (Some(script), None) => lookup.get(script), + _ => None, + } } // @@ -124,22 +108,24 @@ fn unicode_bidi_type(c: u32) -> Option { // equivalent to the 'classify' method in ufo2ft: // -fn classify( +fn classify( char_map: &CM, mut props_fn: F, gsub: Option<&Gsub>, -) -> Result>, ReadError> +) -> Result>, ReadError> where - T: Hash + Eq, - I: Iterator, - F: FnMut(u32) -> I, + T: Ord + Eq, + // instead of returning an iterator, pushes items into the provided buffer + F: FnMut(u32, &mut Vec), CM: CharMap, { - let mut sets = HashMap::new(); + let mut sets = BTreeMap::new(); let mut neutral_glyphs = HashSet::new(); + let mut buf = Vec::new(); for (gid, unicode_value) in char_map.iter_glyphs() { let mut has_props = false; - for prop in props_fn(unicode_value) { + props_fn(unicode_value, &mut buf); + for prop in buf.drain(..) { sets.entry(prop).or_insert(HashSet::new()).insert(gid); has_props = true; } @@ -169,23 +155,25 @@ pub(crate) fn scripts_by_glyph( gsub: Option<&Gsub>, ) -> Result>, ReadError> { let mut result = HashMap::new(); + let lookup = Script::enum_to_short_name_mapper(); for (script, glyphs) in classify( glyphs, - |cp| { - // we need to write this in such a way as to return a single concrete type; - // this is basically two branches, one or the other option is always `None` - let common = known_scripts.is_empty().then_some(COMMON_SCRIPT); - let other_branch = if known_scripts.is_empty() { - None + |cp, buf| { + if known_scripts.is_empty() { + buf.push(COMMON_SCRIPT); } else { - Some(unicode_script_extensions(cp).filter(|script| { - *script == COMMON_SCRIPT - || *script == INHERITED_SCRIPT - || known_scripts.contains(script) - })) - }; - - common.into_iter().chain(other_branch.into_iter().flatten()) + let data = + icu_properties::script::script_with_extensions().get_script_extensions_val(cp); + buf.extend( + data.iter() + .map(|s| lookup.get(s).unwrap()) + .filter(|script| { + *script == COMMON_SCRIPT + || *script == INHERITED_SCRIPT + || known_scripts.contains(script) + }), + ); + } }, gsub, )? { @@ -200,10 +188,10 @@ pub(crate) fn scripts_by_glyph( pub(crate) fn glyphs_by_bidi_class( glyphs: &impl CharMap, gsub: Option<&Gsub>, -) -> Result>, ReadError> { +) -> Result>, ReadError> { classify( glyphs, - |codepoint| unicode_bidi_type(codepoint).into_iter(), + |codepoint, buf| buf.extend(unicode_bidi_type(codepoint)), gsub, ) }