From c80dcd2994d2974f02346be5de72c945d14c4563 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gr=C3=A9goire=20D=C3=A9trez?= Date: Fri, 26 Jul 2024 16:35:42 +0200 Subject: [PATCH] Generate our own unicode block constants Add a script to generate a custom unicode_blocks module with constants representing unicode blocks. This replaces the blocks from unic-ucd-block. Also refactor CharacterType::Range to use a RangeInclusive so we can also remove the dependency on unic-char-range. --- CONTRIBUTING.md | 4 + Cargo.lock | 13 - Cargo.toml | 2 - README.md | 1 - UNIDATA/Blocks.txt | 364 ++++++++++++++++++ generate-unicode-blocks-consts | 60 +++ src/config.rs | 22 +- src/main.rs | 6 +- src/rules.rs | 10 +- src/unicode_blocks.rs | 657 +++++++++++++++++++++++++++++++++ 10 files changed, 1098 insertions(+), 41 deletions(-) create mode 100644 UNIDATA/Blocks.txt create mode 100755 generate-unicode-blocks-consts create mode 100644 src/unicode_blocks.rs diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 55c2775..e7e4fbe 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -15,3 +15,7 @@ issues are raised. Please follow the guidelines at https://cbea.ms/git-commit/ for git commit message style. There is a CI job that will enforce this. 
+ +## Updating unicode blocks + +Run the `generate-unicode-blocks-consts` script from the repository root to refresh `UNIDATA/Blocks.txt` and regenerate the character block constants in `src/unicode_blocks.rs` from Unicode Consortium data. diff --git a/Cargo.lock b/Cargo.lock index 7092fce..1d4e35e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -813,17 +813,6 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc" -[[package]] -name = "unic-ucd-block" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b2a16f2d7ecd25325a1053ca5a66e7fa1b68911a65c5e97f8d2e1b236b6f1d7" -dependencies = [ - "unic-char-property", - "unic-char-range", - "unic-ucd-version", -] - [[package]] name = "unic-ucd-hangul" version = "0.9.0" @@ -886,8 +875,6 @@ dependencies = [ "tree-sitter-python", "tree-sitter-rust", "trycmd", - "unic-char-range", - "unic-ucd-block", "unic-ucd-name", "walkdir", ] diff --git a/Cargo.toml b/Cargo.toml index 41d3660..b886adc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,8 +22,6 @@ tree-sitter = "0.22.6" tree-sitter-javascript = "0.21.2" tree-sitter-python = "0.21.0" unic-ucd-name = "0.9.0" -unic-ucd-block = "0.9.0" -unic-char-range = "0.9.0" toml = "0.8.14" serde = { version = "1.0.203", features = ["derive"] } walkdir = "2.5.0" diff --git a/README.md b/README.md index d4c700c..7502b4f 100644 --- a/README.md +++ b/README.md @@ -99,4 +99,3 @@ default = { [language.python] paths = ["./build", "run-tests", "*.py"] ``` - diff --git a/UNIDATA/Blocks.txt b/UNIDATA/Blocks.txt new file mode 100644 index 0000000..8fa3eaa --- /dev/null +++ b/UNIDATA/Blocks.txt @@ -0,0 +1,364 @@ +# Blocks-15.1.0.txt +# Date: 2023-07-28, 15:47:20 GMT +# © 2023 Unicode®, Inc. 
+# For terms of use, see https://www.unicode.org/terms_of_use.html +# +# Unicode Character Database +# For documentation, see https://www.unicode.org/reports/tr44/ +# +# Format: +# Start Code..End Code; Block Name + +# ================================================ + +# Note: When comparing block names, casing, whitespace, hyphens, +# and underbars are ignored. +# For example, "Latin Extended-A" and "latin extended a" are equivalent. +# For more information on the comparison of property values, +# see UAX #44: https://www.unicode.org/reports/tr44/ +# +# All block ranges start with a value where (cp MOD 16) = 0, +# and end with a value where (cp MOD 16) = 15. In other words, +# the last hexadecimal digit of the start of range is ...0 +# and the last hexadecimal digit of the end of range is ...F. +# This constraint on block ranges guarantees that allocations +# are done in terms of whole columns, and that code chart display +# never involves splitting columns in the charts. +# +# All code points not explicitly listed for Block +# have the value No_Block. 
+ +# Property: Block +# +# @missing: 0000..10FFFF; No_Block + +0000..007F; Basic Latin +0080..00FF; Latin-1 Supplement +0100..017F; Latin Extended-A +0180..024F; Latin Extended-B +0250..02AF; IPA Extensions +02B0..02FF; Spacing Modifier Letters +0300..036F; Combining Diacritical Marks +0370..03FF; Greek and Coptic +0400..04FF; Cyrillic +0500..052F; Cyrillic Supplement +0530..058F; Armenian +0590..05FF; Hebrew +0600..06FF; Arabic +0700..074F; Syriac +0750..077F; Arabic Supplement +0780..07BF; Thaana +07C0..07FF; NKo +0800..083F; Samaritan +0840..085F; Mandaic +0860..086F; Syriac Supplement +0870..089F; Arabic Extended-B +08A0..08FF; Arabic Extended-A +0900..097F; Devanagari +0980..09FF; Bengali +0A00..0A7F; Gurmukhi +0A80..0AFF; Gujarati +0B00..0B7F; Oriya +0B80..0BFF; Tamil +0C00..0C7F; Telugu +0C80..0CFF; Kannada +0D00..0D7F; Malayalam +0D80..0DFF; Sinhala +0E00..0E7F; Thai +0E80..0EFF; Lao +0F00..0FFF; Tibetan +1000..109F; Myanmar +10A0..10FF; Georgian +1100..11FF; Hangul Jamo +1200..137F; Ethiopic +1380..139F; Ethiopic Supplement +13A0..13FF; Cherokee +1400..167F; Unified Canadian Aboriginal Syllabics +1680..169F; Ogham +16A0..16FF; Runic +1700..171F; Tagalog +1720..173F; Hanunoo +1740..175F; Buhid +1760..177F; Tagbanwa +1780..17FF; Khmer +1800..18AF; Mongolian +18B0..18FF; Unified Canadian Aboriginal Syllabics Extended +1900..194F; Limbu +1950..197F; Tai Le +1980..19DF; New Tai Lue +19E0..19FF; Khmer Symbols +1A00..1A1F; Buginese +1A20..1AAF; Tai Tham +1AB0..1AFF; Combining Diacritical Marks Extended +1B00..1B7F; Balinese +1B80..1BBF; Sundanese +1BC0..1BFF; Batak +1C00..1C4F; Lepcha +1C50..1C7F; Ol Chiki +1C80..1C8F; Cyrillic Extended-C +1C90..1CBF; Georgian Extended +1CC0..1CCF; Sundanese Supplement +1CD0..1CFF; Vedic Extensions +1D00..1D7F; Phonetic Extensions +1D80..1DBF; Phonetic Extensions Supplement +1DC0..1DFF; Combining Diacritical Marks Supplement +1E00..1EFF; Latin Extended Additional +1F00..1FFF; Greek Extended +2000..206F; General Punctuation 
+2070..209F; Superscripts and Subscripts +20A0..20CF; Currency Symbols +20D0..20FF; Combining Diacritical Marks for Symbols +2100..214F; Letterlike Symbols +2150..218F; Number Forms +2190..21FF; Arrows +2200..22FF; Mathematical Operators +2300..23FF; Miscellaneous Technical +2400..243F; Control Pictures +2440..245F; Optical Character Recognition +2460..24FF; Enclosed Alphanumerics +2500..257F; Box Drawing +2580..259F; Block Elements +25A0..25FF; Geometric Shapes +2600..26FF; Miscellaneous Symbols +2700..27BF; Dingbats +27C0..27EF; Miscellaneous Mathematical Symbols-A +27F0..27FF; Supplemental Arrows-A +2800..28FF; Braille Patterns +2900..297F; Supplemental Arrows-B +2980..29FF; Miscellaneous Mathematical Symbols-B +2A00..2AFF; Supplemental Mathematical Operators +2B00..2BFF; Miscellaneous Symbols and Arrows +2C00..2C5F; Glagolitic +2C60..2C7F; Latin Extended-C +2C80..2CFF; Coptic +2D00..2D2F; Georgian Supplement +2D30..2D7F; Tifinagh +2D80..2DDF; Ethiopic Extended +2DE0..2DFF; Cyrillic Extended-A +2E00..2E7F; Supplemental Punctuation +2E80..2EFF; CJK Radicals Supplement +2F00..2FDF; Kangxi Radicals +2FF0..2FFF; Ideographic Description Characters +3000..303F; CJK Symbols and Punctuation +3040..309F; Hiragana +30A0..30FF; Katakana +3100..312F; Bopomofo +3130..318F; Hangul Compatibility Jamo +3190..319F; Kanbun +31A0..31BF; Bopomofo Extended +31C0..31EF; CJK Strokes +31F0..31FF; Katakana Phonetic Extensions +3200..32FF; Enclosed CJK Letters and Months +3300..33FF; CJK Compatibility +3400..4DBF; CJK Unified Ideographs Extension A +4DC0..4DFF; Yijing Hexagram Symbols +4E00..9FFF; CJK Unified Ideographs +A000..A48F; Yi Syllables +A490..A4CF; Yi Radicals +A4D0..A4FF; Lisu +A500..A63F; Vai +A640..A69F; Cyrillic Extended-B +A6A0..A6FF; Bamum +A700..A71F; Modifier Tone Letters +A720..A7FF; Latin Extended-D +A800..A82F; Syloti Nagri +A830..A83F; Common Indic Number Forms +A840..A87F; Phags-pa +A880..A8DF; Saurashtra +A8E0..A8FF; Devanagari Extended +A900..A92F; Kayah Li 
+A930..A95F; Rejang +A960..A97F; Hangul Jamo Extended-A +A980..A9DF; Javanese +A9E0..A9FF; Myanmar Extended-B +AA00..AA5F; Cham +AA60..AA7F; Myanmar Extended-A +AA80..AADF; Tai Viet +AAE0..AAFF; Meetei Mayek Extensions +AB00..AB2F; Ethiopic Extended-A +AB30..AB6F; Latin Extended-E +AB70..ABBF; Cherokee Supplement +ABC0..ABFF; Meetei Mayek +AC00..D7AF; Hangul Syllables +D7B0..D7FF; Hangul Jamo Extended-B +D800..DB7F; High Surrogates +DB80..DBFF; High Private Use Surrogates +DC00..DFFF; Low Surrogates +E000..F8FF; Private Use Area +F900..FAFF; CJK Compatibility Ideographs +FB00..FB4F; Alphabetic Presentation Forms +FB50..FDFF; Arabic Presentation Forms-A +FE00..FE0F; Variation Selectors +FE10..FE1F; Vertical Forms +FE20..FE2F; Combining Half Marks +FE30..FE4F; CJK Compatibility Forms +FE50..FE6F; Small Form Variants +FE70..FEFF; Arabic Presentation Forms-B +FF00..FFEF; Halfwidth and Fullwidth Forms +FFF0..FFFF; Specials +10000..1007F; Linear B Syllabary +10080..100FF; Linear B Ideograms +10100..1013F; Aegean Numbers +10140..1018F; Ancient Greek Numbers +10190..101CF; Ancient Symbols +101D0..101FF; Phaistos Disc +10280..1029F; Lycian +102A0..102DF; Carian +102E0..102FF; Coptic Epact Numbers +10300..1032F; Old Italic +10330..1034F; Gothic +10350..1037F; Old Permic +10380..1039F; Ugaritic +103A0..103DF; Old Persian +10400..1044F; Deseret +10450..1047F; Shavian +10480..104AF; Osmanya +104B0..104FF; Osage +10500..1052F; Elbasan +10530..1056F; Caucasian Albanian +10570..105BF; Vithkuqi +10600..1077F; Linear A +10780..107BF; Latin Extended-F +10800..1083F; Cypriot Syllabary +10840..1085F; Imperial Aramaic +10860..1087F; Palmyrene +10880..108AF; Nabataean +108E0..108FF; Hatran +10900..1091F; Phoenician +10920..1093F; Lydian +10980..1099F; Meroitic Hieroglyphs +109A0..109FF; Meroitic Cursive +10A00..10A5F; Kharoshthi +10A60..10A7F; Old South Arabian +10A80..10A9F; Old North Arabian +10AC0..10AFF; Manichaean +10B00..10B3F; Avestan +10B40..10B5F; Inscriptional Parthian 
+10B60..10B7F; Inscriptional Pahlavi +10B80..10BAF; Psalter Pahlavi +10C00..10C4F; Old Turkic +10C80..10CFF; Old Hungarian +10D00..10D3F; Hanifi Rohingya +10E60..10E7F; Rumi Numeral Symbols +10E80..10EBF; Yezidi +10EC0..10EFF; Arabic Extended-C +10F00..10F2F; Old Sogdian +10F30..10F6F; Sogdian +10F70..10FAF; Old Uyghur +10FB0..10FDF; Chorasmian +10FE0..10FFF; Elymaic +11000..1107F; Brahmi +11080..110CF; Kaithi +110D0..110FF; Sora Sompeng +11100..1114F; Chakma +11150..1117F; Mahajani +11180..111DF; Sharada +111E0..111FF; Sinhala Archaic Numbers +11200..1124F; Khojki +11280..112AF; Multani +112B0..112FF; Khudawadi +11300..1137F; Grantha +11400..1147F; Newa +11480..114DF; Tirhuta +11580..115FF; Siddham +11600..1165F; Modi +11660..1167F; Mongolian Supplement +11680..116CF; Takri +11700..1174F; Ahom +11800..1184F; Dogra +118A0..118FF; Warang Citi +11900..1195F; Dives Akuru +119A0..119FF; Nandinagari +11A00..11A4F; Zanabazar Square +11A50..11AAF; Soyombo +11AB0..11ABF; Unified Canadian Aboriginal Syllabics Extended-A +11AC0..11AFF; Pau Cin Hau +11B00..11B5F; Devanagari Extended-A +11C00..11C6F; Bhaiksuki +11C70..11CBF; Marchen +11D00..11D5F; Masaram Gondi +11D60..11DAF; Gunjala Gondi +11EE0..11EFF; Makasar +11F00..11F5F; Kawi +11FB0..11FBF; Lisu Supplement +11FC0..11FFF; Tamil Supplement +12000..123FF; Cuneiform +12400..1247F; Cuneiform Numbers and Punctuation +12480..1254F; Early Dynastic Cuneiform +12F90..12FFF; Cypro-Minoan +13000..1342F; Egyptian Hieroglyphs +13430..1345F; Egyptian Hieroglyph Format Controls +14400..1467F; Anatolian Hieroglyphs +16800..16A3F; Bamum Supplement +16A40..16A6F; Mro +16A70..16ACF; Tangsa +16AD0..16AFF; Bassa Vah +16B00..16B8F; Pahawh Hmong +16E40..16E9F; Medefaidrin +16F00..16F9F; Miao +16FE0..16FFF; Ideographic Symbols and Punctuation +17000..187FF; Tangut +18800..18AFF; Tangut Components +18B00..18CFF; Khitan Small Script +18D00..18D7F; Tangut Supplement +1AFF0..1AFFF; Kana Extended-B +1B000..1B0FF; Kana Supplement +1B100..1B12F; Kana 
Extended-A +1B130..1B16F; Small Kana Extension +1B170..1B2FF; Nushu +1BC00..1BC9F; Duployan +1BCA0..1BCAF; Shorthand Format Controls +1CF00..1CFCF; Znamenny Musical Notation +1D000..1D0FF; Byzantine Musical Symbols +1D100..1D1FF; Musical Symbols +1D200..1D24F; Ancient Greek Musical Notation +1D2C0..1D2DF; Kaktovik Numerals +1D2E0..1D2FF; Mayan Numerals +1D300..1D35F; Tai Xuan Jing Symbols +1D360..1D37F; Counting Rod Numerals +1D400..1D7FF; Mathematical Alphanumeric Symbols +1D800..1DAAF; Sutton SignWriting +1DF00..1DFFF; Latin Extended-G +1E000..1E02F; Glagolitic Supplement +1E030..1E08F; Cyrillic Extended-D +1E100..1E14F; Nyiakeng Puachue Hmong +1E290..1E2BF; Toto +1E2C0..1E2FF; Wancho +1E4D0..1E4FF; Nag Mundari +1E7E0..1E7FF; Ethiopic Extended-B +1E800..1E8DF; Mende Kikakui +1E900..1E95F; Adlam +1EC70..1ECBF; Indic Siyaq Numbers +1ED00..1ED4F; Ottoman Siyaq Numbers +1EE00..1EEFF; Arabic Mathematical Alphabetic Symbols +1F000..1F02F; Mahjong Tiles +1F030..1F09F; Domino Tiles +1F0A0..1F0FF; Playing Cards +1F100..1F1FF; Enclosed Alphanumeric Supplement +1F200..1F2FF; Enclosed Ideographic Supplement +1F300..1F5FF; Miscellaneous Symbols and Pictographs +1F600..1F64F; Emoticons +1F650..1F67F; Ornamental Dingbats +1F680..1F6FF; Transport and Map Symbols +1F700..1F77F; Alchemical Symbols +1F780..1F7FF; Geometric Shapes Extended +1F800..1F8FF; Supplemental Arrows-C +1F900..1F9FF; Supplemental Symbols and Pictographs +1FA00..1FA6F; Chess Symbols +1FA70..1FAFF; Symbols and Pictographs Extended-A +1FB00..1FBFF; Symbols for Legacy Computing +20000..2A6DF; CJK Unified Ideographs Extension B +2A700..2B73F; CJK Unified Ideographs Extension C +2B740..2B81F; CJK Unified Ideographs Extension D +2B820..2CEAF; CJK Unified Ideographs Extension E +2CEB0..2EBEF; CJK Unified Ideographs Extension F +2EBF0..2EE5F; CJK Unified Ideographs Extension I +2F800..2FA1F; CJK Compatibility Ideographs Supplement +30000..3134F; CJK Unified Ideographs Extension G +31350..323AF; CJK Unified Ideographs 
Extension H +E0000..E007F; Tags +E0100..E01EF; Variation Selectors Supplement +F0000..FFFFF; Supplementary Private Use Area-A +100000..10FFFF; Supplementary Private Use Area-B + +# EOF diff --git a/generate-unicode-blocks-consts b/generate-unicode-blocks-consts new file mode 100755 index 0000000..c8a095b --- /dev/null +++ b/generate-unicode-blocks-consts @@ -0,0 +1,60 @@ +#!/usr/bin/env python +""" +Download Unicode block data and generates rust constants from it. +""" + +import pathlib +import re +import sys +import urllib.request + +BLOCKDEF = re.compile( + r"^(?P[0-9A-Fa-f]+)\.\.(?P[0-9A-Fa-f]+); (?P.*)$" +) + +TXTBLOCKFILE = pathlib.Path("UNIDATA/Blocks.txt") +RSBLOCKFILE = pathlib.Path("src/unicode_blocks.rs") + +resp = urllib.request.urlopen("https://www.unicode.org/Public/UNIDATA/Blocks.txt") +blocksdata = resp.read() +TXTBLOCKFILE.write_bytes(blocksdata) + +blocks = [] +for line in blocksdata.decode().splitlines(): + if match := BLOCKDEF.match(line.strip()): + name = match.group("name") + if name in {"Low Surrogates", "High Surrogates", "High Private Use Surrogates"}: + # Surrogate code points are not valid chars in rust + continue + low = match.group("low") + high = match.group("high") + blocks.append((name, low, high)) + + +def constname(blockname): + return blockname.replace(" ", "_").replace("-", "_").upper() + + +with RSBLOCKFILE.open("w") as file: + print("#![cfg_attr(rustfmt, rustfmt_skip)]", file=file) + print( + f"// Code generated by {sys.argv[0]}. DO NOT EDIT.", + file=file, + ) + print(file=file) + for name, low, high in blocks: + rustrange = f"'\\u{{{low}}}'..='\\u{{{high}}}'" + print( + f"pub const {constname(name)}: std::ops::RangeInclusive = {rustrange};", + file=file, + ) + + print( + """ +/// UNICODE_BLOCKS is a mapping from the pretty block name to the character range. +pub static UNICODE_BLOCKS: phf::Map<&'static str, std::ops::RangeInclusive> = phf::phf_map! 
{""", + file=file, + ) + for name, _, _ in blocks: + print(f' "{name}" => {constname(name)},', file=file) + print("};", file=file) diff --git a/src/config.rs b/src/config.rs index 5a52418..fc5e305 100644 --- a/src/config.rs +++ b/src/config.rs @@ -35,15 +35,13 @@ impl FromStr for CharacterType { if s == "*" { return Ok(Self::Anything); } - for block in unic_ucd_block::BlockIter::new() { - if block.name == s { - return Ok(Self::Block(block)); - } + if let Some(range) = crate::unicode_blocks::UNICODE_BLOCKS.get(s) { + return Ok(Self::Block(range)); } if let Some((low, high)) = s.split_once("..") { let low = unicode_notation_to_char(low)?; let high = unicode_notation_to_char(high)?; - return Ok(Self::Range(unic_char_range::CharRange { low, high })); + return Ok(Self::Range(low..=high)); } unicode_notation_to_char(s).map(Self::CodePoint) } @@ -158,9 +156,6 @@ pub struct Config { #[cfg(test)] mod tests { - use unic_char_range::CharRange; - use unic_ucd_block::BlockIter; - use super::*; use crate::rules::*; @@ -212,8 +207,6 @@ deny = ["Tibetan"] ) .unwrap(); - let tibetan_block = BlockIter::new().find(|b| b.name == "Tibetan").unwrap(); - let expected_config = Config { global: ConfigRules { default: RuleSet { @@ -235,19 +228,16 @@ deny = ["Tibetan"] rules: ConfigRules { default: RuleSet { allow: vec![ - CharacterType::Block(tibetan_block), + CharacterType::Block(&crate::unicode_blocks::TIBETAN), CharacterType::CodePoint('\u{9000}'), ], - deny: vec![CharacterType::Range(CharRange { - low: '\u{5000}', - high: '\u{5004}', - })], + deny: vec![CharacterType::Range('\u{5000}'..='\u{5004}')], }, code_type_rules: HashMap::from([( CodeType::StringLiteral, RuleSet { allow: vec![], - deny: vec![CharacterType::Block(tibetan_block)], + deny: vec![CharacterType::Block(&crate::unicode_blocks::TIBETAN)], }, )]), }, diff --git a/src/main.rs b/src/main.rs index 2ac1b39..ccca318 100644 --- a/src/main.rs +++ b/src/main.rs @@ -15,6 +15,7 @@ use unic_ucd_name::Name; mod config; mod rules; +mod 
unicode_blocks; // Replaces the previous idea of "RuleChain"s. struct RuleDispatcher { @@ -188,13 +189,10 @@ fn get_user_config() -> anyhow::Result> { /// Comments and string literals allow all unicode except Bidi characters, /// all other kinds of code deny all unicode. fn get_default_config() -> Config { - let ascii = unic_ucd_block::BlockIter::new() - .find(|b| b.name == "Basic Latin") - .unwrap(); Config { global: config::ConfigRules { default: RuleSet { - allow: vec![rules::CharacterType::Block(ascii)], + allow: vec![rules::CharacterType::Block(&unicode_blocks::BASIC_LATIN)], deny: vec![], }, code_type_rules: [ diff --git a/src/rules.rs b/src/rules.rs index 6fb8314..5ba2318 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -45,11 +45,11 @@ pub enum CharacterType { /// Single character (eg. "U+9000") CodePoint(char), /// An inclusive range of characters (eg. "U+1400..U+1409") - Range(unic_char_range::CharRange), + Range(std::ops::RangeInclusive), /// All bidirectional control characters (right to left etc) Bidi, /// Named ranges of characters (eg. "Tibetan", "Box Drawing") - Block(unic_ucd_block::Block), + Block(&'static std::ops::RangeInclusive), /// Any possible character. 
Anything, } @@ -58,14 +58,14 @@ impl CharacterType { fn matches(&self, c: char) -> bool { match self { Self::CodePoint(rule_char) => *rule_char == c, - Self::Range(range) => range.contains(c), + Self::Range(range) => range.contains(&c), Self::Bidi => [ // List of bidirectional formatting characters from https://en.wikipedia.org/wiki/Trojan_Source '\u{202A}', '\u{202b}', '\u{202c}', '\u{202d}', '\u{202e}', '\u{2066}', '\u{2067}', '\u{2068}', '\u{2069}', ] .contains(&c), - Self::Block(block) => block.range.contains(c), + Self::Block(range) => range.contains(&c), Self::Anything => true, } } @@ -88,7 +88,7 @@ impl PartialEq for CharacterType { (CodePoint(self_c), CodePoint(other_c)) => self_c == other_c, (Range(self_r), Range(other_r)) => self_r == other_r, (Bidi, Bidi) => true, - (Block(self_block), Block(other_block)) => self_block.name == other_block.name, + (Block(self_range), Block(other_range)) => self_range == other_range, (Anything, Anything) => true, _ => false, } diff --git a/src/unicode_blocks.rs b/src/unicode_blocks.rs new file mode 100644 index 0000000..d90e49c --- /dev/null +++ b/src/unicode_blocks.rs @@ -0,0 +1,657 @@ +#![cfg_attr(rustfmt, rustfmt_skip)] +// Code generated by generate-unicode-blocks-consts. DO NOT EDIT. 
+ +pub const BASIC_LATIN: std::ops::RangeInclusive = '\u{0000}'..='\u{007F}'; +pub const LATIN_1_SUPPLEMENT: std::ops::RangeInclusive = '\u{0080}'..='\u{00FF}'; +pub const LATIN_EXTENDED_A: std::ops::RangeInclusive = '\u{0100}'..='\u{017F}'; +pub const LATIN_EXTENDED_B: std::ops::RangeInclusive = '\u{0180}'..='\u{024F}'; +pub const IPA_EXTENSIONS: std::ops::RangeInclusive = '\u{0250}'..='\u{02AF}'; +pub const SPACING_MODIFIER_LETTERS: std::ops::RangeInclusive = '\u{02B0}'..='\u{02FF}'; +pub const COMBINING_DIACRITICAL_MARKS: std::ops::RangeInclusive = '\u{0300}'..='\u{036F}'; +pub const GREEK_AND_COPTIC: std::ops::RangeInclusive = '\u{0370}'..='\u{03FF}'; +pub const CYRILLIC: std::ops::RangeInclusive = '\u{0400}'..='\u{04FF}'; +pub const CYRILLIC_SUPPLEMENT: std::ops::RangeInclusive = '\u{0500}'..='\u{052F}'; +pub const ARMENIAN: std::ops::RangeInclusive = '\u{0530}'..='\u{058F}'; +pub const HEBREW: std::ops::RangeInclusive = '\u{0590}'..='\u{05FF}'; +pub const ARABIC: std::ops::RangeInclusive = '\u{0600}'..='\u{06FF}'; +pub const SYRIAC: std::ops::RangeInclusive = '\u{0700}'..='\u{074F}'; +pub const ARABIC_SUPPLEMENT: std::ops::RangeInclusive = '\u{0750}'..='\u{077F}'; +pub const THAANA: std::ops::RangeInclusive = '\u{0780}'..='\u{07BF}'; +pub const NKO: std::ops::RangeInclusive = '\u{07C0}'..='\u{07FF}'; +pub const SAMARITAN: std::ops::RangeInclusive = '\u{0800}'..='\u{083F}'; +pub const MANDAIC: std::ops::RangeInclusive = '\u{0840}'..='\u{085F}'; +pub const SYRIAC_SUPPLEMENT: std::ops::RangeInclusive = '\u{0860}'..='\u{086F}'; +pub const ARABIC_EXTENDED_B: std::ops::RangeInclusive = '\u{0870}'..='\u{089F}'; +pub const ARABIC_EXTENDED_A: std::ops::RangeInclusive = '\u{08A0}'..='\u{08FF}'; +pub const DEVANAGARI: std::ops::RangeInclusive = '\u{0900}'..='\u{097F}'; +pub const BENGALI: std::ops::RangeInclusive = '\u{0980}'..='\u{09FF}'; +pub const GURMUKHI: std::ops::RangeInclusive = '\u{0A00}'..='\u{0A7F}'; +pub const GUJARATI: std::ops::RangeInclusive = 
'\u{0A80}'..='\u{0AFF}'; +pub const ORIYA: std::ops::RangeInclusive = '\u{0B00}'..='\u{0B7F}'; +pub const TAMIL: std::ops::RangeInclusive = '\u{0B80}'..='\u{0BFF}'; +pub const TELUGU: std::ops::RangeInclusive = '\u{0C00}'..='\u{0C7F}'; +pub const KANNADA: std::ops::RangeInclusive = '\u{0C80}'..='\u{0CFF}'; +pub const MALAYALAM: std::ops::RangeInclusive = '\u{0D00}'..='\u{0D7F}'; +pub const SINHALA: std::ops::RangeInclusive = '\u{0D80}'..='\u{0DFF}'; +pub const THAI: std::ops::RangeInclusive = '\u{0E00}'..='\u{0E7F}'; +pub const LAO: std::ops::RangeInclusive = '\u{0E80}'..='\u{0EFF}'; +pub const TIBETAN: std::ops::RangeInclusive = '\u{0F00}'..='\u{0FFF}'; +pub const MYANMAR: std::ops::RangeInclusive = '\u{1000}'..='\u{109F}'; +pub const GEORGIAN: std::ops::RangeInclusive = '\u{10A0}'..='\u{10FF}'; +pub const HANGUL_JAMO: std::ops::RangeInclusive = '\u{1100}'..='\u{11FF}'; +pub const ETHIOPIC: std::ops::RangeInclusive = '\u{1200}'..='\u{137F}'; +pub const ETHIOPIC_SUPPLEMENT: std::ops::RangeInclusive = '\u{1380}'..='\u{139F}'; +pub const CHEROKEE: std::ops::RangeInclusive = '\u{13A0}'..='\u{13FF}'; +pub const UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS: std::ops::RangeInclusive = '\u{1400}'..='\u{167F}'; +pub const OGHAM: std::ops::RangeInclusive = '\u{1680}'..='\u{169F}'; +pub const RUNIC: std::ops::RangeInclusive = '\u{16A0}'..='\u{16FF}'; +pub const TAGALOG: std::ops::RangeInclusive = '\u{1700}'..='\u{171F}'; +pub const HANUNOO: std::ops::RangeInclusive = '\u{1720}'..='\u{173F}'; +pub const BUHID: std::ops::RangeInclusive = '\u{1740}'..='\u{175F}'; +pub const TAGBANWA: std::ops::RangeInclusive = '\u{1760}'..='\u{177F}'; +pub const KHMER: std::ops::RangeInclusive = '\u{1780}'..='\u{17FF}'; +pub const MONGOLIAN: std::ops::RangeInclusive = '\u{1800}'..='\u{18AF}'; +pub const UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED: std::ops::RangeInclusive = '\u{18B0}'..='\u{18FF}'; +pub const LIMBU: std::ops::RangeInclusive = '\u{1900}'..='\u{194F}'; +pub const TAI_LE: 
std::ops::RangeInclusive = '\u{1950}'..='\u{197F}'; +pub const NEW_TAI_LUE: std::ops::RangeInclusive = '\u{1980}'..='\u{19DF}'; +pub const KHMER_SYMBOLS: std::ops::RangeInclusive = '\u{19E0}'..='\u{19FF}'; +pub const BUGINESE: std::ops::RangeInclusive = '\u{1A00}'..='\u{1A1F}'; +pub const TAI_THAM: std::ops::RangeInclusive = '\u{1A20}'..='\u{1AAF}'; +pub const COMBINING_DIACRITICAL_MARKS_EXTENDED: std::ops::RangeInclusive = '\u{1AB0}'..='\u{1AFF}'; +pub const BALINESE: std::ops::RangeInclusive = '\u{1B00}'..='\u{1B7F}'; +pub const SUNDANESE: std::ops::RangeInclusive = '\u{1B80}'..='\u{1BBF}'; +pub const BATAK: std::ops::RangeInclusive = '\u{1BC0}'..='\u{1BFF}'; +pub const LEPCHA: std::ops::RangeInclusive = '\u{1C00}'..='\u{1C4F}'; +pub const OL_CHIKI: std::ops::RangeInclusive = '\u{1C50}'..='\u{1C7F}'; +pub const CYRILLIC_EXTENDED_C: std::ops::RangeInclusive = '\u{1C80}'..='\u{1C8F}'; +pub const GEORGIAN_EXTENDED: std::ops::RangeInclusive = '\u{1C90}'..='\u{1CBF}'; +pub const SUNDANESE_SUPPLEMENT: std::ops::RangeInclusive = '\u{1CC0}'..='\u{1CCF}'; +pub const VEDIC_EXTENSIONS: std::ops::RangeInclusive = '\u{1CD0}'..='\u{1CFF}'; +pub const PHONETIC_EXTENSIONS: std::ops::RangeInclusive = '\u{1D00}'..='\u{1D7F}'; +pub const PHONETIC_EXTENSIONS_SUPPLEMENT: std::ops::RangeInclusive = '\u{1D80}'..='\u{1DBF}'; +pub const COMBINING_DIACRITICAL_MARKS_SUPPLEMENT: std::ops::RangeInclusive = '\u{1DC0}'..='\u{1DFF}'; +pub const LATIN_EXTENDED_ADDITIONAL: std::ops::RangeInclusive = '\u{1E00}'..='\u{1EFF}'; +pub const GREEK_EXTENDED: std::ops::RangeInclusive = '\u{1F00}'..='\u{1FFF}'; +pub const GENERAL_PUNCTUATION: std::ops::RangeInclusive = '\u{2000}'..='\u{206F}'; +pub const SUPERSCRIPTS_AND_SUBSCRIPTS: std::ops::RangeInclusive = '\u{2070}'..='\u{209F}'; +pub const CURRENCY_SYMBOLS: std::ops::RangeInclusive = '\u{20A0}'..='\u{20CF}'; +pub const COMBINING_DIACRITICAL_MARKS_FOR_SYMBOLS: std::ops::RangeInclusive = '\u{20D0}'..='\u{20FF}'; +pub const LETTERLIKE_SYMBOLS: 
std::ops::RangeInclusive = '\u{2100}'..='\u{214F}'; +pub const NUMBER_FORMS: std::ops::RangeInclusive = '\u{2150}'..='\u{218F}'; +pub const ARROWS: std::ops::RangeInclusive = '\u{2190}'..='\u{21FF}'; +pub const MATHEMATICAL_OPERATORS: std::ops::RangeInclusive = '\u{2200}'..='\u{22FF}'; +pub const MISCELLANEOUS_TECHNICAL: std::ops::RangeInclusive = '\u{2300}'..='\u{23FF}'; +pub const CONTROL_PICTURES: std::ops::RangeInclusive = '\u{2400}'..='\u{243F}'; +pub const OPTICAL_CHARACTER_RECOGNITION: std::ops::RangeInclusive = '\u{2440}'..='\u{245F}'; +pub const ENCLOSED_ALPHANUMERICS: std::ops::RangeInclusive = '\u{2460}'..='\u{24FF}'; +pub const BOX_DRAWING: std::ops::RangeInclusive = '\u{2500}'..='\u{257F}'; +pub const BLOCK_ELEMENTS: std::ops::RangeInclusive = '\u{2580}'..='\u{259F}'; +pub const GEOMETRIC_SHAPES: std::ops::RangeInclusive = '\u{25A0}'..='\u{25FF}'; +pub const MISCELLANEOUS_SYMBOLS: std::ops::RangeInclusive = '\u{2600}'..='\u{26FF}'; +pub const DINGBATS: std::ops::RangeInclusive = '\u{2700}'..='\u{27BF}'; +pub const MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A: std::ops::RangeInclusive = '\u{27C0}'..='\u{27EF}'; +pub const SUPPLEMENTAL_ARROWS_A: std::ops::RangeInclusive = '\u{27F0}'..='\u{27FF}'; +pub const BRAILLE_PATTERNS: std::ops::RangeInclusive = '\u{2800}'..='\u{28FF}'; +pub const SUPPLEMENTAL_ARROWS_B: std::ops::RangeInclusive = '\u{2900}'..='\u{297F}'; +pub const MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B: std::ops::RangeInclusive = '\u{2980}'..='\u{29FF}'; +pub const SUPPLEMENTAL_MATHEMATICAL_OPERATORS: std::ops::RangeInclusive = '\u{2A00}'..='\u{2AFF}'; +pub const MISCELLANEOUS_SYMBOLS_AND_ARROWS: std::ops::RangeInclusive = '\u{2B00}'..='\u{2BFF}'; +pub const GLAGOLITIC: std::ops::RangeInclusive = '\u{2C00}'..='\u{2C5F}'; +pub const LATIN_EXTENDED_C: std::ops::RangeInclusive = '\u{2C60}'..='\u{2C7F}'; +pub const COPTIC: std::ops::RangeInclusive = '\u{2C80}'..='\u{2CFF}'; +pub const GEORGIAN_SUPPLEMENT: std::ops::RangeInclusive = '\u{2D00}'..='\u{2D2F}'; +pub 
const TIFINAGH: std::ops::RangeInclusive = '\u{2D30}'..='\u{2D7F}'; +pub const ETHIOPIC_EXTENDED: std::ops::RangeInclusive = '\u{2D80}'..='\u{2DDF}'; +pub const CYRILLIC_EXTENDED_A: std::ops::RangeInclusive = '\u{2DE0}'..='\u{2DFF}'; +pub const SUPPLEMENTAL_PUNCTUATION: std::ops::RangeInclusive = '\u{2E00}'..='\u{2E7F}'; +pub const CJK_RADICALS_SUPPLEMENT: std::ops::RangeInclusive = '\u{2E80}'..='\u{2EFF}'; +pub const KANGXI_RADICALS: std::ops::RangeInclusive = '\u{2F00}'..='\u{2FDF}'; +pub const IDEOGRAPHIC_DESCRIPTION_CHARACTERS: std::ops::RangeInclusive = '\u{2FF0}'..='\u{2FFF}'; +pub const CJK_SYMBOLS_AND_PUNCTUATION: std::ops::RangeInclusive = '\u{3000}'..='\u{303F}'; +pub const HIRAGANA: std::ops::RangeInclusive = '\u{3040}'..='\u{309F}'; +pub const KATAKANA: std::ops::RangeInclusive = '\u{30A0}'..='\u{30FF}'; +pub const BOPOMOFO: std::ops::RangeInclusive = '\u{3100}'..='\u{312F}'; +pub const HANGUL_COMPATIBILITY_JAMO: std::ops::RangeInclusive = '\u{3130}'..='\u{318F}'; +pub const KANBUN: std::ops::RangeInclusive = '\u{3190}'..='\u{319F}'; +pub const BOPOMOFO_EXTENDED: std::ops::RangeInclusive = '\u{31A0}'..='\u{31BF}'; +pub const CJK_STROKES: std::ops::RangeInclusive = '\u{31C0}'..='\u{31EF}'; +pub const KATAKANA_PHONETIC_EXTENSIONS: std::ops::RangeInclusive = '\u{31F0}'..='\u{31FF}'; +pub const ENCLOSED_CJK_LETTERS_AND_MONTHS: std::ops::RangeInclusive = '\u{3200}'..='\u{32FF}'; +pub const CJK_COMPATIBILITY: std::ops::RangeInclusive = '\u{3300}'..='\u{33FF}'; +pub const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A: std::ops::RangeInclusive = '\u{3400}'..='\u{4DBF}'; +pub const YIJING_HEXAGRAM_SYMBOLS: std::ops::RangeInclusive = '\u{4DC0}'..='\u{4DFF}'; +pub const CJK_UNIFIED_IDEOGRAPHS: std::ops::RangeInclusive = '\u{4E00}'..='\u{9FFF}'; +pub const YI_SYLLABLES: std::ops::RangeInclusive = '\u{A000}'..='\u{A48F}'; +pub const YI_RADICALS: std::ops::RangeInclusive = '\u{A490}'..='\u{A4CF}'; +pub const LISU: std::ops::RangeInclusive = '\u{A4D0}'..='\u{A4FF}'; +pub const 
VAI: std::ops::RangeInclusive = '\u{A500}'..='\u{A63F}'; +pub const CYRILLIC_EXTENDED_B: std::ops::RangeInclusive = '\u{A640}'..='\u{A69F}'; +pub const BAMUM: std::ops::RangeInclusive = '\u{A6A0}'..='\u{A6FF}'; +pub const MODIFIER_TONE_LETTERS: std::ops::RangeInclusive = '\u{A700}'..='\u{A71F}'; +pub const LATIN_EXTENDED_D: std::ops::RangeInclusive = '\u{A720}'..='\u{A7FF}'; +pub const SYLOTI_NAGRI: std::ops::RangeInclusive = '\u{A800}'..='\u{A82F}'; +pub const COMMON_INDIC_NUMBER_FORMS: std::ops::RangeInclusive = '\u{A830}'..='\u{A83F}'; +pub const PHAGS_PA: std::ops::RangeInclusive = '\u{A840}'..='\u{A87F}'; +pub const SAURASHTRA: std::ops::RangeInclusive = '\u{A880}'..='\u{A8DF}'; +pub const DEVANAGARI_EXTENDED: std::ops::RangeInclusive = '\u{A8E0}'..='\u{A8FF}'; +pub const KAYAH_LI: std::ops::RangeInclusive = '\u{A900}'..='\u{A92F}'; +pub const REJANG: std::ops::RangeInclusive = '\u{A930}'..='\u{A95F}'; +pub const HANGUL_JAMO_EXTENDED_A: std::ops::RangeInclusive = '\u{A960}'..='\u{A97F}'; +pub const JAVANESE: std::ops::RangeInclusive = '\u{A980}'..='\u{A9DF}'; +pub const MYANMAR_EXTENDED_B: std::ops::RangeInclusive = '\u{A9E0}'..='\u{A9FF}'; +pub const CHAM: std::ops::RangeInclusive = '\u{AA00}'..='\u{AA5F}'; +pub const MYANMAR_EXTENDED_A: std::ops::RangeInclusive = '\u{AA60}'..='\u{AA7F}'; +pub const TAI_VIET: std::ops::RangeInclusive = '\u{AA80}'..='\u{AADF}'; +pub const MEETEI_MAYEK_EXTENSIONS: std::ops::RangeInclusive = '\u{AAE0}'..='\u{AAFF}'; +pub const ETHIOPIC_EXTENDED_A: std::ops::RangeInclusive = '\u{AB00}'..='\u{AB2F}'; +pub const LATIN_EXTENDED_E: std::ops::RangeInclusive = '\u{AB30}'..='\u{AB6F}'; +pub const CHEROKEE_SUPPLEMENT: std::ops::RangeInclusive = '\u{AB70}'..='\u{ABBF}'; +pub const MEETEI_MAYEK: std::ops::RangeInclusive = '\u{ABC0}'..='\u{ABFF}'; +pub const HANGUL_SYLLABLES: std::ops::RangeInclusive = '\u{AC00}'..='\u{D7AF}'; +pub const HANGUL_JAMO_EXTENDED_B: std::ops::RangeInclusive = '\u{D7B0}'..='\u{D7FF}'; +pub const PRIVATE_USE_AREA: 
std::ops::RangeInclusive = '\u{E000}'..='\u{F8FF}'; +pub const CJK_COMPATIBILITY_IDEOGRAPHS: std::ops::RangeInclusive = '\u{F900}'..='\u{FAFF}'; +pub const ALPHABETIC_PRESENTATION_FORMS: std::ops::RangeInclusive = '\u{FB00}'..='\u{FB4F}'; +pub const ARABIC_PRESENTATION_FORMS_A: std::ops::RangeInclusive = '\u{FB50}'..='\u{FDFF}'; +pub const VARIATION_SELECTORS: std::ops::RangeInclusive = '\u{FE00}'..='\u{FE0F}'; +pub const VERTICAL_FORMS: std::ops::RangeInclusive = '\u{FE10}'..='\u{FE1F}'; +pub const COMBINING_HALF_MARKS: std::ops::RangeInclusive = '\u{FE20}'..='\u{FE2F}'; +pub const CJK_COMPATIBILITY_FORMS: std::ops::RangeInclusive = '\u{FE30}'..='\u{FE4F}'; +pub const SMALL_FORM_VARIANTS: std::ops::RangeInclusive = '\u{FE50}'..='\u{FE6F}'; +pub const ARABIC_PRESENTATION_FORMS_B: std::ops::RangeInclusive = '\u{FE70}'..='\u{FEFF}'; +pub const HALFWIDTH_AND_FULLWIDTH_FORMS: std::ops::RangeInclusive = '\u{FF00}'..='\u{FFEF}'; +pub const SPECIALS: std::ops::RangeInclusive = '\u{FFF0}'..='\u{FFFF}'; +pub const LINEAR_B_SYLLABARY: std::ops::RangeInclusive = '\u{10000}'..='\u{1007F}'; +pub const LINEAR_B_IDEOGRAMS: std::ops::RangeInclusive = '\u{10080}'..='\u{100FF}'; +pub const AEGEAN_NUMBERS: std::ops::RangeInclusive = '\u{10100}'..='\u{1013F}'; +pub const ANCIENT_GREEK_NUMBERS: std::ops::RangeInclusive = '\u{10140}'..='\u{1018F}'; +pub const ANCIENT_SYMBOLS: std::ops::RangeInclusive = '\u{10190}'..='\u{101CF}'; +pub const PHAISTOS_DISC: std::ops::RangeInclusive = '\u{101D0}'..='\u{101FF}'; +pub const LYCIAN: std::ops::RangeInclusive = '\u{10280}'..='\u{1029F}'; +pub const CARIAN: std::ops::RangeInclusive = '\u{102A0}'..='\u{102DF}'; +pub const COPTIC_EPACT_NUMBERS: std::ops::RangeInclusive = '\u{102E0}'..='\u{102FF}'; +pub const OLD_ITALIC: std::ops::RangeInclusive = '\u{10300}'..='\u{1032F}'; +pub const GOTHIC: std::ops::RangeInclusive = '\u{10330}'..='\u{1034F}'; +pub const OLD_PERMIC: std::ops::RangeInclusive = '\u{10350}'..='\u{1037F}'; +pub const UGARITIC: 
std::ops::RangeInclusive = '\u{10380}'..='\u{1039F}'; +pub const OLD_PERSIAN: std::ops::RangeInclusive = '\u{103A0}'..='\u{103DF}'; +pub const DESERET: std::ops::RangeInclusive = '\u{10400}'..='\u{1044F}'; +pub const SHAVIAN: std::ops::RangeInclusive = '\u{10450}'..='\u{1047F}'; +pub const OSMANYA: std::ops::RangeInclusive = '\u{10480}'..='\u{104AF}'; +pub const OSAGE: std::ops::RangeInclusive = '\u{104B0}'..='\u{104FF}'; +pub const ELBASAN: std::ops::RangeInclusive = '\u{10500}'..='\u{1052F}'; +pub const CAUCASIAN_ALBANIAN: std::ops::RangeInclusive = '\u{10530}'..='\u{1056F}'; +pub const VITHKUQI: std::ops::RangeInclusive = '\u{10570}'..='\u{105BF}'; +pub const LINEAR_A: std::ops::RangeInclusive = '\u{10600}'..='\u{1077F}'; +pub const LATIN_EXTENDED_F: std::ops::RangeInclusive = '\u{10780}'..='\u{107BF}'; +pub const CYPRIOT_SYLLABARY: std::ops::RangeInclusive = '\u{10800}'..='\u{1083F}'; +pub const IMPERIAL_ARAMAIC: std::ops::RangeInclusive = '\u{10840}'..='\u{1085F}'; +pub const PALMYRENE: std::ops::RangeInclusive = '\u{10860}'..='\u{1087F}'; +pub const NABATAEAN: std::ops::RangeInclusive = '\u{10880}'..='\u{108AF}'; +pub const HATRAN: std::ops::RangeInclusive = '\u{108E0}'..='\u{108FF}'; +pub const PHOENICIAN: std::ops::RangeInclusive = '\u{10900}'..='\u{1091F}'; +pub const LYDIAN: std::ops::RangeInclusive = '\u{10920}'..='\u{1093F}'; +pub const MEROITIC_HIEROGLYPHS: std::ops::RangeInclusive = '\u{10980}'..='\u{1099F}'; +pub const MEROITIC_CURSIVE: std::ops::RangeInclusive = '\u{109A0}'..='\u{109FF}'; +pub const KHAROSHTHI: std::ops::RangeInclusive = '\u{10A00}'..='\u{10A5F}'; +pub const OLD_SOUTH_ARABIAN: std::ops::RangeInclusive = '\u{10A60}'..='\u{10A7F}'; +pub const OLD_NORTH_ARABIAN: std::ops::RangeInclusive = '\u{10A80}'..='\u{10A9F}'; +pub const MANICHAEAN: std::ops::RangeInclusive = '\u{10AC0}'..='\u{10AFF}'; +pub const AVESTAN: std::ops::RangeInclusive = '\u{10B00}'..='\u{10B3F}'; +pub const INSCRIPTIONAL_PARTHIAN: std::ops::RangeInclusive = 
'\u{10B40}'..='\u{10B5F}'; +pub const INSCRIPTIONAL_PAHLAVI: std::ops::RangeInclusive = '\u{10B60}'..='\u{10B7F}'; +pub const PSALTER_PAHLAVI: std::ops::RangeInclusive = '\u{10B80}'..='\u{10BAF}'; +pub const OLD_TURKIC: std::ops::RangeInclusive = '\u{10C00}'..='\u{10C4F}'; +pub const OLD_HUNGARIAN: std::ops::RangeInclusive = '\u{10C80}'..='\u{10CFF}'; +pub const HANIFI_ROHINGYA: std::ops::RangeInclusive = '\u{10D00}'..='\u{10D3F}'; +pub const RUMI_NUMERAL_SYMBOLS: std::ops::RangeInclusive = '\u{10E60}'..='\u{10E7F}'; +pub const YEZIDI: std::ops::RangeInclusive = '\u{10E80}'..='\u{10EBF}'; +pub const ARABIC_EXTENDED_C: std::ops::RangeInclusive = '\u{10EC0}'..='\u{10EFF}'; +pub const OLD_SOGDIAN: std::ops::RangeInclusive = '\u{10F00}'..='\u{10F2F}'; +pub const SOGDIAN: std::ops::RangeInclusive = '\u{10F30}'..='\u{10F6F}'; +pub const OLD_UYGHUR: std::ops::RangeInclusive = '\u{10F70}'..='\u{10FAF}'; +pub const CHORASMIAN: std::ops::RangeInclusive = '\u{10FB0}'..='\u{10FDF}'; +pub const ELYMAIC: std::ops::RangeInclusive = '\u{10FE0}'..='\u{10FFF}'; +pub const BRAHMI: std::ops::RangeInclusive = '\u{11000}'..='\u{1107F}'; +pub const KAITHI: std::ops::RangeInclusive = '\u{11080}'..='\u{110CF}'; +pub const SORA_SOMPENG: std::ops::RangeInclusive = '\u{110D0}'..='\u{110FF}'; +pub const CHAKMA: std::ops::RangeInclusive = '\u{11100}'..='\u{1114F}'; +pub const MAHAJANI: std::ops::RangeInclusive = '\u{11150}'..='\u{1117F}'; +pub const SHARADA: std::ops::RangeInclusive = '\u{11180}'..='\u{111DF}'; +pub const SINHALA_ARCHAIC_NUMBERS: std::ops::RangeInclusive = '\u{111E0}'..='\u{111FF}'; +pub const KHOJKI: std::ops::RangeInclusive = '\u{11200}'..='\u{1124F}'; +pub const MULTANI: std::ops::RangeInclusive = '\u{11280}'..='\u{112AF}'; +pub const KHUDAWADI: std::ops::RangeInclusive = '\u{112B0}'..='\u{112FF}'; +pub const GRANTHA: std::ops::RangeInclusive = '\u{11300}'..='\u{1137F}'; +pub const NEWA: std::ops::RangeInclusive = '\u{11400}'..='\u{1147F}'; +pub const TIRHUTA: 
std::ops::RangeInclusive = '\u{11480}'..='\u{114DF}'; +pub const SIDDHAM: std::ops::RangeInclusive = '\u{11580}'..='\u{115FF}'; +pub const MODI: std::ops::RangeInclusive = '\u{11600}'..='\u{1165F}'; +pub const MONGOLIAN_SUPPLEMENT: std::ops::RangeInclusive = '\u{11660}'..='\u{1167F}'; +pub const TAKRI: std::ops::RangeInclusive = '\u{11680}'..='\u{116CF}'; +pub const AHOM: std::ops::RangeInclusive = '\u{11700}'..='\u{1174F}'; +pub const DOGRA: std::ops::RangeInclusive = '\u{11800}'..='\u{1184F}'; +pub const WARANG_CITI: std::ops::RangeInclusive = '\u{118A0}'..='\u{118FF}'; +pub const DIVES_AKURU: std::ops::RangeInclusive = '\u{11900}'..='\u{1195F}'; +pub const NANDINAGARI: std::ops::RangeInclusive = '\u{119A0}'..='\u{119FF}'; +pub const ZANABAZAR_SQUARE: std::ops::RangeInclusive = '\u{11A00}'..='\u{11A4F}'; +pub const SOYOMBO: std::ops::RangeInclusive = '\u{11A50}'..='\u{11AAF}'; +pub const UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED_A: std::ops::RangeInclusive = '\u{11AB0}'..='\u{11ABF}'; +pub const PAU_CIN_HAU: std::ops::RangeInclusive = '\u{11AC0}'..='\u{11AFF}'; +pub const DEVANAGARI_EXTENDED_A: std::ops::RangeInclusive = '\u{11B00}'..='\u{11B5F}'; +pub const BHAIKSUKI: std::ops::RangeInclusive = '\u{11C00}'..='\u{11C6F}'; +pub const MARCHEN: std::ops::RangeInclusive = '\u{11C70}'..='\u{11CBF}'; +pub const MASARAM_GONDI: std::ops::RangeInclusive = '\u{11D00}'..='\u{11D5F}'; +pub const GUNJALA_GONDI: std::ops::RangeInclusive = '\u{11D60}'..='\u{11DAF}'; +pub const MAKASAR: std::ops::RangeInclusive = '\u{11EE0}'..='\u{11EFF}'; +pub const KAWI: std::ops::RangeInclusive = '\u{11F00}'..='\u{11F5F}'; +pub const LISU_SUPPLEMENT: std::ops::RangeInclusive = '\u{11FB0}'..='\u{11FBF}'; +pub const TAMIL_SUPPLEMENT: std::ops::RangeInclusive = '\u{11FC0}'..='\u{11FFF}'; +pub const CUNEIFORM: std::ops::RangeInclusive = '\u{12000}'..='\u{123FF}'; +pub const CUNEIFORM_NUMBERS_AND_PUNCTUATION: std::ops::RangeInclusive = '\u{12400}'..='\u{1247F}'; +pub const 
EARLY_DYNASTIC_CUNEIFORM: std::ops::RangeInclusive = '\u{12480}'..='\u{1254F}'; +pub const CYPRO_MINOAN: std::ops::RangeInclusive = '\u{12F90}'..='\u{12FFF}'; +pub const EGYPTIAN_HIEROGLYPHS: std::ops::RangeInclusive = '\u{13000}'..='\u{1342F}'; +pub const EGYPTIAN_HIEROGLYPH_FORMAT_CONTROLS: std::ops::RangeInclusive = '\u{13430}'..='\u{1345F}'; +pub const ANATOLIAN_HIEROGLYPHS: std::ops::RangeInclusive = '\u{14400}'..='\u{1467F}'; +pub const BAMUM_SUPPLEMENT: std::ops::RangeInclusive = '\u{16800}'..='\u{16A3F}'; +pub const MRO: std::ops::RangeInclusive = '\u{16A40}'..='\u{16A6F}'; +pub const TANGSA: std::ops::RangeInclusive = '\u{16A70}'..='\u{16ACF}'; +pub const BASSA_VAH: std::ops::RangeInclusive = '\u{16AD0}'..='\u{16AFF}'; +pub const PAHAWH_HMONG: std::ops::RangeInclusive = '\u{16B00}'..='\u{16B8F}'; +pub const MEDEFAIDRIN: std::ops::RangeInclusive = '\u{16E40}'..='\u{16E9F}'; +pub const MIAO: std::ops::RangeInclusive = '\u{16F00}'..='\u{16F9F}'; +pub const IDEOGRAPHIC_SYMBOLS_AND_PUNCTUATION: std::ops::RangeInclusive = '\u{16FE0}'..='\u{16FFF}'; +pub const TANGUT: std::ops::RangeInclusive = '\u{17000}'..='\u{187FF}'; +pub const TANGUT_COMPONENTS: std::ops::RangeInclusive = '\u{18800}'..='\u{18AFF}'; +pub const KHITAN_SMALL_SCRIPT: std::ops::RangeInclusive = '\u{18B00}'..='\u{18CFF}'; +pub const TANGUT_SUPPLEMENT: std::ops::RangeInclusive = '\u{18D00}'..='\u{18D7F}'; +pub const KANA_EXTENDED_B: std::ops::RangeInclusive = '\u{1AFF0}'..='\u{1AFFF}'; +pub const KANA_SUPPLEMENT: std::ops::RangeInclusive = '\u{1B000}'..='\u{1B0FF}'; +pub const KANA_EXTENDED_A: std::ops::RangeInclusive = '\u{1B100}'..='\u{1B12F}'; +pub const SMALL_KANA_EXTENSION: std::ops::RangeInclusive = '\u{1B130}'..='\u{1B16F}'; +pub const NUSHU: std::ops::RangeInclusive = '\u{1B170}'..='\u{1B2FF}'; +pub const DUPLOYAN: std::ops::RangeInclusive = '\u{1BC00}'..='\u{1BC9F}'; +pub const SHORTHAND_FORMAT_CONTROLS: std::ops::RangeInclusive = '\u{1BCA0}'..='\u{1BCAF}'; +pub const 
ZNAMENNY_MUSICAL_NOTATION: std::ops::RangeInclusive = '\u{1CF00}'..='\u{1CFCF}'; +pub const BYZANTINE_MUSICAL_SYMBOLS: std::ops::RangeInclusive = '\u{1D000}'..='\u{1D0FF}'; +pub const MUSICAL_SYMBOLS: std::ops::RangeInclusive = '\u{1D100}'..='\u{1D1FF}'; +pub const ANCIENT_GREEK_MUSICAL_NOTATION: std::ops::RangeInclusive = '\u{1D200}'..='\u{1D24F}'; +pub const KAKTOVIK_NUMERALS: std::ops::RangeInclusive = '\u{1D2C0}'..='\u{1D2DF}'; +pub const MAYAN_NUMERALS: std::ops::RangeInclusive = '\u{1D2E0}'..='\u{1D2FF}'; +pub const TAI_XUAN_JING_SYMBOLS: std::ops::RangeInclusive = '\u{1D300}'..='\u{1D35F}'; +pub const COUNTING_ROD_NUMERALS: std::ops::RangeInclusive = '\u{1D360}'..='\u{1D37F}'; +pub const MATHEMATICAL_ALPHANUMERIC_SYMBOLS: std::ops::RangeInclusive = '\u{1D400}'..='\u{1D7FF}'; +pub const SUTTON_SIGNWRITING: std::ops::RangeInclusive = '\u{1D800}'..='\u{1DAAF}'; +pub const LATIN_EXTENDED_G: std::ops::RangeInclusive = '\u{1DF00}'..='\u{1DFFF}'; +pub const GLAGOLITIC_SUPPLEMENT: std::ops::RangeInclusive = '\u{1E000}'..='\u{1E02F}'; +pub const CYRILLIC_EXTENDED_D: std::ops::RangeInclusive = '\u{1E030}'..='\u{1E08F}'; +pub const NYIAKENG_PUACHUE_HMONG: std::ops::RangeInclusive = '\u{1E100}'..='\u{1E14F}'; +pub const TOTO: std::ops::RangeInclusive = '\u{1E290}'..='\u{1E2BF}'; +pub const WANCHO: std::ops::RangeInclusive = '\u{1E2C0}'..='\u{1E2FF}'; +pub const NAG_MUNDARI: std::ops::RangeInclusive = '\u{1E4D0}'..='\u{1E4FF}'; +pub const ETHIOPIC_EXTENDED_B: std::ops::RangeInclusive = '\u{1E7E0}'..='\u{1E7FF}'; +pub const MENDE_KIKAKUI: std::ops::RangeInclusive = '\u{1E800}'..='\u{1E8DF}'; +pub const ADLAM: std::ops::RangeInclusive = '\u{1E900}'..='\u{1E95F}'; +pub const INDIC_SIYAQ_NUMBERS: std::ops::RangeInclusive = '\u{1EC70}'..='\u{1ECBF}'; +pub const OTTOMAN_SIYAQ_NUMBERS: std::ops::RangeInclusive = '\u{1ED00}'..='\u{1ED4F}'; +pub const ARABIC_MATHEMATICAL_ALPHABETIC_SYMBOLS: std::ops::RangeInclusive = '\u{1EE00}'..='\u{1EEFF}'; +pub const MAHJONG_TILES: 
std::ops::RangeInclusive = '\u{1F000}'..='\u{1F02F}'; +pub const DOMINO_TILES: std::ops::RangeInclusive = '\u{1F030}'..='\u{1F09F}'; +pub const PLAYING_CARDS: std::ops::RangeInclusive = '\u{1F0A0}'..='\u{1F0FF}'; +pub const ENCLOSED_ALPHANUMERIC_SUPPLEMENT: std::ops::RangeInclusive = '\u{1F100}'..='\u{1F1FF}'; +pub const ENCLOSED_IDEOGRAPHIC_SUPPLEMENT: std::ops::RangeInclusive = '\u{1F200}'..='\u{1F2FF}'; +pub const MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS: std::ops::RangeInclusive = '\u{1F300}'..='\u{1F5FF}'; +pub const EMOTICONS: std::ops::RangeInclusive = '\u{1F600}'..='\u{1F64F}'; +pub const ORNAMENTAL_DINGBATS: std::ops::RangeInclusive = '\u{1F650}'..='\u{1F67F}'; +pub const TRANSPORT_AND_MAP_SYMBOLS: std::ops::RangeInclusive = '\u{1F680}'..='\u{1F6FF}'; +pub const ALCHEMICAL_SYMBOLS: std::ops::RangeInclusive = '\u{1F700}'..='\u{1F77F}'; +pub const GEOMETRIC_SHAPES_EXTENDED: std::ops::RangeInclusive = '\u{1F780}'..='\u{1F7FF}'; +pub const SUPPLEMENTAL_ARROWS_C: std::ops::RangeInclusive = '\u{1F800}'..='\u{1F8FF}'; +pub const SUPPLEMENTAL_SYMBOLS_AND_PICTOGRAPHS: std::ops::RangeInclusive = '\u{1F900}'..='\u{1F9FF}'; +pub const CHESS_SYMBOLS: std::ops::RangeInclusive = '\u{1FA00}'..='\u{1FA6F}'; +pub const SYMBOLS_AND_PICTOGRAPHS_EXTENDED_A: std::ops::RangeInclusive = '\u{1FA70}'..='\u{1FAFF}'; +pub const SYMBOLS_FOR_LEGACY_COMPUTING: std::ops::RangeInclusive = '\u{1FB00}'..='\u{1FBFF}'; +pub const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B: std::ops::RangeInclusive = '\u{20000}'..='\u{2A6DF}'; +pub const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C: std::ops::RangeInclusive = '\u{2A700}'..='\u{2B73F}'; +pub const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D: std::ops::RangeInclusive = '\u{2B740}'..='\u{2B81F}'; +pub const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E: std::ops::RangeInclusive = '\u{2B820}'..='\u{2CEAF}'; +pub const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F: std::ops::RangeInclusive = '\u{2CEB0}'..='\u{2EBEF}'; +pub const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I: std::ops::RangeInclusive = 
'\u{2EBF0}'..='\u{2EE5F}'; +pub const CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT: std::ops::RangeInclusive = '\u{2F800}'..='\u{2FA1F}'; +pub const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_G: std::ops::RangeInclusive = '\u{30000}'..='\u{3134F}'; +pub const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_H: std::ops::RangeInclusive = '\u{31350}'..='\u{323AF}'; +pub const TAGS: std::ops::RangeInclusive = '\u{E0000}'..='\u{E007F}'; +pub const VARIATION_SELECTORS_SUPPLEMENT: std::ops::RangeInclusive = '\u{E0100}'..='\u{E01EF}'; +pub const SUPPLEMENTARY_PRIVATE_USE_AREA_A: std::ops::RangeInclusive = '\u{F0000}'..='\u{FFFFF}'; +pub const SUPPLEMENTARY_PRIVATE_USE_AREA_B: std::ops::RangeInclusive = '\u{100000}'..='\u{10FFFF}'; + +/// UNICODE_BLOCKS is a mapping from the pretty block name to the character range. +pub static UNICODE_BLOCKS: phf::Map<&'static str, std::ops::RangeInclusive> = phf::phf_map! { + "Basic Latin" => BASIC_LATIN, + "Latin-1 Supplement" => LATIN_1_SUPPLEMENT, + "Latin Extended-A" => LATIN_EXTENDED_A, + "Latin Extended-B" => LATIN_EXTENDED_B, + "IPA Extensions" => IPA_EXTENSIONS, + "Spacing Modifier Letters" => SPACING_MODIFIER_LETTERS, + "Combining Diacritical Marks" => COMBINING_DIACRITICAL_MARKS, + "Greek and Coptic" => GREEK_AND_COPTIC, + "Cyrillic" => CYRILLIC, + "Cyrillic Supplement" => CYRILLIC_SUPPLEMENT, + "Armenian" => ARMENIAN, + "Hebrew" => HEBREW, + "Arabic" => ARABIC, + "Syriac" => SYRIAC, + "Arabic Supplement" => ARABIC_SUPPLEMENT, + "Thaana" => THAANA, + "NKo" => NKO, + "Samaritan" => SAMARITAN, + "Mandaic" => MANDAIC, + "Syriac Supplement" => SYRIAC_SUPPLEMENT, + "Arabic Extended-B" => ARABIC_EXTENDED_B, + "Arabic Extended-A" => ARABIC_EXTENDED_A, + "Devanagari" => DEVANAGARI, + "Bengali" => BENGALI, + "Gurmukhi" => GURMUKHI, + "Gujarati" => GUJARATI, + "Oriya" => ORIYA, + "Tamil" => TAMIL, + "Telugu" => TELUGU, + "Kannada" => KANNADA, + "Malayalam" => MALAYALAM, + "Sinhala" => SINHALA, + "Thai" => THAI, + "Lao" => LAO, + "Tibetan" => TIBETAN, + "Myanmar" => 
MYANMAR, + "Georgian" => GEORGIAN, + "Hangul Jamo" => HANGUL_JAMO, + "Ethiopic" => ETHIOPIC, + "Ethiopic Supplement" => ETHIOPIC_SUPPLEMENT, + "Cherokee" => CHEROKEE, + "Unified Canadian Aboriginal Syllabics" => UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS, + "Ogham" => OGHAM, + "Runic" => RUNIC, + "Tagalog" => TAGALOG, + "Hanunoo" => HANUNOO, + "Buhid" => BUHID, + "Tagbanwa" => TAGBANWA, + "Khmer" => KHMER, + "Mongolian" => MONGOLIAN, + "Unified Canadian Aboriginal Syllabics Extended" => UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED, + "Limbu" => LIMBU, + "Tai Le" => TAI_LE, + "New Tai Lue" => NEW_TAI_LUE, + "Khmer Symbols" => KHMER_SYMBOLS, + "Buginese" => BUGINESE, + "Tai Tham" => TAI_THAM, + "Combining Diacritical Marks Extended" => COMBINING_DIACRITICAL_MARKS_EXTENDED, + "Balinese" => BALINESE, + "Sundanese" => SUNDANESE, + "Batak" => BATAK, + "Lepcha" => LEPCHA, + "Ol Chiki" => OL_CHIKI, + "Cyrillic Extended-C" => CYRILLIC_EXTENDED_C, + "Georgian Extended" => GEORGIAN_EXTENDED, + "Sundanese Supplement" => SUNDANESE_SUPPLEMENT, + "Vedic Extensions" => VEDIC_EXTENSIONS, + "Phonetic Extensions" => PHONETIC_EXTENSIONS, + "Phonetic Extensions Supplement" => PHONETIC_EXTENSIONS_SUPPLEMENT, + "Combining Diacritical Marks Supplement" => COMBINING_DIACRITICAL_MARKS_SUPPLEMENT, + "Latin Extended Additional" => LATIN_EXTENDED_ADDITIONAL, + "Greek Extended" => GREEK_EXTENDED, + "General Punctuation" => GENERAL_PUNCTUATION, + "Superscripts and Subscripts" => SUPERSCRIPTS_AND_SUBSCRIPTS, + "Currency Symbols" => CURRENCY_SYMBOLS, + "Combining Diacritical Marks for Symbols" => COMBINING_DIACRITICAL_MARKS_FOR_SYMBOLS, + "Letterlike Symbols" => LETTERLIKE_SYMBOLS, + "Number Forms" => NUMBER_FORMS, + "Arrows" => ARROWS, + "Mathematical Operators" => MATHEMATICAL_OPERATORS, + "Miscellaneous Technical" => MISCELLANEOUS_TECHNICAL, + "Control Pictures" => CONTROL_PICTURES, + "Optical Character Recognition" => OPTICAL_CHARACTER_RECOGNITION, + "Enclosed Alphanumerics" => 
ENCLOSED_ALPHANUMERICS, + "Box Drawing" => BOX_DRAWING, + "Block Elements" => BLOCK_ELEMENTS, + "Geometric Shapes" => GEOMETRIC_SHAPES, + "Miscellaneous Symbols" => MISCELLANEOUS_SYMBOLS, + "Dingbats" => DINGBATS, + "Miscellaneous Mathematical Symbols-A" => MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A, + "Supplemental Arrows-A" => SUPPLEMENTAL_ARROWS_A, + "Braille Patterns" => BRAILLE_PATTERNS, + "Supplemental Arrows-B" => SUPPLEMENTAL_ARROWS_B, + "Miscellaneous Mathematical Symbols-B" => MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B, + "Supplemental Mathematical Operators" => SUPPLEMENTAL_MATHEMATICAL_OPERATORS, + "Miscellaneous Symbols and Arrows" => MISCELLANEOUS_SYMBOLS_AND_ARROWS, + "Glagolitic" => GLAGOLITIC, + "Latin Extended-C" => LATIN_EXTENDED_C, + "Coptic" => COPTIC, + "Georgian Supplement" => GEORGIAN_SUPPLEMENT, + "Tifinagh" => TIFINAGH, + "Ethiopic Extended" => ETHIOPIC_EXTENDED, + "Cyrillic Extended-A" => CYRILLIC_EXTENDED_A, + "Supplemental Punctuation" => SUPPLEMENTAL_PUNCTUATION, + "CJK Radicals Supplement" => CJK_RADICALS_SUPPLEMENT, + "Kangxi Radicals" => KANGXI_RADICALS, + "Ideographic Description Characters" => IDEOGRAPHIC_DESCRIPTION_CHARACTERS, + "CJK Symbols and Punctuation" => CJK_SYMBOLS_AND_PUNCTUATION, + "Hiragana" => HIRAGANA, + "Katakana" => KATAKANA, + "Bopomofo" => BOPOMOFO, + "Hangul Compatibility Jamo" => HANGUL_COMPATIBILITY_JAMO, + "Kanbun" => KANBUN, + "Bopomofo Extended" => BOPOMOFO_EXTENDED, + "CJK Strokes" => CJK_STROKES, + "Katakana Phonetic Extensions" => KATAKANA_PHONETIC_EXTENSIONS, + "Enclosed CJK Letters and Months" => ENCLOSED_CJK_LETTERS_AND_MONTHS, + "CJK Compatibility" => CJK_COMPATIBILITY, + "CJK Unified Ideographs Extension A" => CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A, + "Yijing Hexagram Symbols" => YIJING_HEXAGRAM_SYMBOLS, + "CJK Unified Ideographs" => CJK_UNIFIED_IDEOGRAPHS, + "Yi Syllables" => YI_SYLLABLES, + "Yi Radicals" => YI_RADICALS, + "Lisu" => LISU, + "Vai" => VAI, + "Cyrillic Extended-B" => CYRILLIC_EXTENDED_B, + "Bamum" 
=> BAMUM, + "Modifier Tone Letters" => MODIFIER_TONE_LETTERS, + "Latin Extended-D" => LATIN_EXTENDED_D, + "Syloti Nagri" => SYLOTI_NAGRI, + "Common Indic Number Forms" => COMMON_INDIC_NUMBER_FORMS, + "Phags-pa" => PHAGS_PA, + "Saurashtra" => SAURASHTRA, + "Devanagari Extended" => DEVANAGARI_EXTENDED, + "Kayah Li" => KAYAH_LI, + "Rejang" => REJANG, + "Hangul Jamo Extended-A" => HANGUL_JAMO_EXTENDED_A, + "Javanese" => JAVANESE, + "Myanmar Extended-B" => MYANMAR_EXTENDED_B, + "Cham" => CHAM, + "Myanmar Extended-A" => MYANMAR_EXTENDED_A, + "Tai Viet" => TAI_VIET, + "Meetei Mayek Extensions" => MEETEI_MAYEK_EXTENSIONS, + "Ethiopic Extended-A" => ETHIOPIC_EXTENDED_A, + "Latin Extended-E" => LATIN_EXTENDED_E, + "Cherokee Supplement" => CHEROKEE_SUPPLEMENT, + "Meetei Mayek" => MEETEI_MAYEK, + "Hangul Syllables" => HANGUL_SYLLABLES, + "Hangul Jamo Extended-B" => HANGUL_JAMO_EXTENDED_B, + "Private Use Area" => PRIVATE_USE_AREA, + "CJK Compatibility Ideographs" => CJK_COMPATIBILITY_IDEOGRAPHS, + "Alphabetic Presentation Forms" => ALPHABETIC_PRESENTATION_FORMS, + "Arabic Presentation Forms-A" => ARABIC_PRESENTATION_FORMS_A, + "Variation Selectors" => VARIATION_SELECTORS, + "Vertical Forms" => VERTICAL_FORMS, + "Combining Half Marks" => COMBINING_HALF_MARKS, + "CJK Compatibility Forms" => CJK_COMPATIBILITY_FORMS, + "Small Form Variants" => SMALL_FORM_VARIANTS, + "Arabic Presentation Forms-B" => ARABIC_PRESENTATION_FORMS_B, + "Halfwidth and Fullwidth Forms" => HALFWIDTH_AND_FULLWIDTH_FORMS, + "Specials" => SPECIALS, + "Linear B Syllabary" => LINEAR_B_SYLLABARY, + "Linear B Ideograms" => LINEAR_B_IDEOGRAMS, + "Aegean Numbers" => AEGEAN_NUMBERS, + "Ancient Greek Numbers" => ANCIENT_GREEK_NUMBERS, + "Ancient Symbols" => ANCIENT_SYMBOLS, + "Phaistos Disc" => PHAISTOS_DISC, + "Lycian" => LYCIAN, + "Carian" => CARIAN, + "Coptic Epact Numbers" => COPTIC_EPACT_NUMBERS, + "Old Italic" => OLD_ITALIC, + "Gothic" => GOTHIC, + "Old Permic" => OLD_PERMIC, + "Ugaritic" => UGARITIC, + "Old 
Persian" => OLD_PERSIAN, + "Deseret" => DESERET, + "Shavian" => SHAVIAN, + "Osmanya" => OSMANYA, + "Osage" => OSAGE, + "Elbasan" => ELBASAN, + "Caucasian Albanian" => CAUCASIAN_ALBANIAN, + "Vithkuqi" => VITHKUQI, + "Linear A" => LINEAR_A, + "Latin Extended-F" => LATIN_EXTENDED_F, + "Cypriot Syllabary" => CYPRIOT_SYLLABARY, + "Imperial Aramaic" => IMPERIAL_ARAMAIC, + "Palmyrene" => PALMYRENE, + "Nabataean" => NABATAEAN, + "Hatran" => HATRAN, + "Phoenician" => PHOENICIAN, + "Lydian" => LYDIAN, + "Meroitic Hieroglyphs" => MEROITIC_HIEROGLYPHS, + "Meroitic Cursive" => MEROITIC_CURSIVE, + "Kharoshthi" => KHAROSHTHI, + "Old South Arabian" => OLD_SOUTH_ARABIAN, + "Old North Arabian" => OLD_NORTH_ARABIAN, + "Manichaean" => MANICHAEAN, + "Avestan" => AVESTAN, + "Inscriptional Parthian" => INSCRIPTIONAL_PARTHIAN, + "Inscriptional Pahlavi" => INSCRIPTIONAL_PAHLAVI, + "Psalter Pahlavi" => PSALTER_PAHLAVI, + "Old Turkic" => OLD_TURKIC, + "Old Hungarian" => OLD_HUNGARIAN, + "Hanifi Rohingya" => HANIFI_ROHINGYA, + "Rumi Numeral Symbols" => RUMI_NUMERAL_SYMBOLS, + "Yezidi" => YEZIDI, + "Arabic Extended-C" => ARABIC_EXTENDED_C, + "Old Sogdian" => OLD_SOGDIAN, + "Sogdian" => SOGDIAN, + "Old Uyghur" => OLD_UYGHUR, + "Chorasmian" => CHORASMIAN, + "Elymaic" => ELYMAIC, + "Brahmi" => BRAHMI, + "Kaithi" => KAITHI, + "Sora Sompeng" => SORA_SOMPENG, + "Chakma" => CHAKMA, + "Mahajani" => MAHAJANI, + "Sharada" => SHARADA, + "Sinhala Archaic Numbers" => SINHALA_ARCHAIC_NUMBERS, + "Khojki" => KHOJKI, + "Multani" => MULTANI, + "Khudawadi" => KHUDAWADI, + "Grantha" => GRANTHA, + "Newa" => NEWA, + "Tirhuta" => TIRHUTA, + "Siddham" => SIDDHAM, + "Modi" => MODI, + "Mongolian Supplement" => MONGOLIAN_SUPPLEMENT, + "Takri" => TAKRI, + "Ahom" => AHOM, + "Dogra" => DOGRA, + "Warang Citi" => WARANG_CITI, + "Dives Akuru" => DIVES_AKURU, + "Nandinagari" => NANDINAGARI, + "Zanabazar Square" => ZANABAZAR_SQUARE, + "Soyombo" => SOYOMBO, + "Unified Canadian Aboriginal Syllabics Extended-A" => 
UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED_A, + "Pau Cin Hau" => PAU_CIN_HAU, + "Devanagari Extended-A" => DEVANAGARI_EXTENDED_A, + "Bhaiksuki" => BHAIKSUKI, + "Marchen" => MARCHEN, + "Masaram Gondi" => MASARAM_GONDI, + "Gunjala Gondi" => GUNJALA_GONDI, + "Makasar" => MAKASAR, + "Kawi" => KAWI, + "Lisu Supplement" => LISU_SUPPLEMENT, + "Tamil Supplement" => TAMIL_SUPPLEMENT, + "Cuneiform" => CUNEIFORM, + "Cuneiform Numbers and Punctuation" => CUNEIFORM_NUMBERS_AND_PUNCTUATION, + "Early Dynastic Cuneiform" => EARLY_DYNASTIC_CUNEIFORM, + "Cypro-Minoan" => CYPRO_MINOAN, + "Egyptian Hieroglyphs" => EGYPTIAN_HIEROGLYPHS, + "Egyptian Hieroglyph Format Controls" => EGYPTIAN_HIEROGLYPH_FORMAT_CONTROLS, + "Anatolian Hieroglyphs" => ANATOLIAN_HIEROGLYPHS, + "Bamum Supplement" => BAMUM_SUPPLEMENT, + "Mro" => MRO, + "Tangsa" => TANGSA, + "Bassa Vah" => BASSA_VAH, + "Pahawh Hmong" => PAHAWH_HMONG, + "Medefaidrin" => MEDEFAIDRIN, + "Miao" => MIAO, + "Ideographic Symbols and Punctuation" => IDEOGRAPHIC_SYMBOLS_AND_PUNCTUATION, + "Tangut" => TANGUT, + "Tangut Components" => TANGUT_COMPONENTS, + "Khitan Small Script" => KHITAN_SMALL_SCRIPT, + "Tangut Supplement" => TANGUT_SUPPLEMENT, + "Kana Extended-B" => KANA_EXTENDED_B, + "Kana Supplement" => KANA_SUPPLEMENT, + "Kana Extended-A" => KANA_EXTENDED_A, + "Small Kana Extension" => SMALL_KANA_EXTENSION, + "Nushu" => NUSHU, + "Duployan" => DUPLOYAN, + "Shorthand Format Controls" => SHORTHAND_FORMAT_CONTROLS, + "Znamenny Musical Notation" => ZNAMENNY_MUSICAL_NOTATION, + "Byzantine Musical Symbols" => BYZANTINE_MUSICAL_SYMBOLS, + "Musical Symbols" => MUSICAL_SYMBOLS, + "Ancient Greek Musical Notation" => ANCIENT_GREEK_MUSICAL_NOTATION, + "Kaktovik Numerals" => KAKTOVIK_NUMERALS, + "Mayan Numerals" => MAYAN_NUMERALS, + "Tai Xuan Jing Symbols" => TAI_XUAN_JING_SYMBOLS, + "Counting Rod Numerals" => COUNTING_ROD_NUMERALS, + "Mathematical Alphanumeric Symbols" => MATHEMATICAL_ALPHANUMERIC_SYMBOLS, + "Sutton SignWriting" => 
SUTTON_SIGNWRITING, + "Latin Extended-G" => LATIN_EXTENDED_G, + "Glagolitic Supplement" => GLAGOLITIC_SUPPLEMENT, + "Cyrillic Extended-D" => CYRILLIC_EXTENDED_D, + "Nyiakeng Puachue Hmong" => NYIAKENG_PUACHUE_HMONG, + "Toto" => TOTO, + "Wancho" => WANCHO, + "Nag Mundari" => NAG_MUNDARI, + "Ethiopic Extended-B" => ETHIOPIC_EXTENDED_B, + "Mende Kikakui" => MENDE_KIKAKUI, + "Adlam" => ADLAM, + "Indic Siyaq Numbers" => INDIC_SIYAQ_NUMBERS, + "Ottoman Siyaq Numbers" => OTTOMAN_SIYAQ_NUMBERS, + "Arabic Mathematical Alphabetic Symbols" => ARABIC_MATHEMATICAL_ALPHABETIC_SYMBOLS, + "Mahjong Tiles" => MAHJONG_TILES, + "Domino Tiles" => DOMINO_TILES, + "Playing Cards" => PLAYING_CARDS, + "Enclosed Alphanumeric Supplement" => ENCLOSED_ALPHANUMERIC_SUPPLEMENT, + "Enclosed Ideographic Supplement" => ENCLOSED_IDEOGRAPHIC_SUPPLEMENT, + "Miscellaneous Symbols and Pictographs" => MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS, + "Emoticons" => EMOTICONS, + "Ornamental Dingbats" => ORNAMENTAL_DINGBATS, + "Transport and Map Symbols" => TRANSPORT_AND_MAP_SYMBOLS, + "Alchemical Symbols" => ALCHEMICAL_SYMBOLS, + "Geometric Shapes Extended" => GEOMETRIC_SHAPES_EXTENDED, + "Supplemental Arrows-C" => SUPPLEMENTAL_ARROWS_C, + "Supplemental Symbols and Pictographs" => SUPPLEMENTAL_SYMBOLS_AND_PICTOGRAPHS, + "Chess Symbols" => CHESS_SYMBOLS, + "Symbols and Pictographs Extended-A" => SYMBOLS_AND_PICTOGRAPHS_EXTENDED_A, + "Symbols for Legacy Computing" => SYMBOLS_FOR_LEGACY_COMPUTING, + "CJK Unified Ideographs Extension B" => CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, + "CJK Unified Ideographs Extension C" => CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C, + "CJK Unified Ideographs Extension D" => CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D, + "CJK Unified Ideographs Extension E" => CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E, + "CJK Unified Ideographs Extension F" => CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F, + "CJK Unified Ideographs Extension I" => CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I, + "CJK Compatibility Ideographs Supplement" => 
CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, + "CJK Unified Ideographs Extension G" => CJK_UNIFIED_IDEOGRAPHS_EXTENSION_G, + "CJK Unified Ideographs Extension H" => CJK_UNIFIED_IDEOGRAPHS_EXTENSION_H, + "Tags" => TAGS, + "Variation Selectors Supplement" => VARIATION_SELECTORS_SUPPLEMENT, + "Supplementary Private Use Area-A" => SUPPLEMENTARY_PRIVATE_USE_AREA_A, + "Supplementary Private Use Area-B" => SUPPLEMENTARY_PRIVATE_USE_AREA_B, +};