-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate-unicode-blocks-consts
executable file
·60 lines (50 loc) · 1.78 KB
/
generate-unicode-blocks-consts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/env python
"""
Download Unicode block data and generate Rust constants from it.
"""
import pathlib
import re
import sys
import urllib.request
# Matches one block definition line: "<low hex>..<high hex>; <block name>".
BLOCKDEF = re.compile(
    r"^(?P<low>[0-9A-Fa-f]+)\.\.(?P<high>[0-9A-Fa-f]+); (?P<name>.*)$"
)
# Where the raw downloaded block list is stored.
TXTBLOCKFILE = pathlib.Path("UNIDATA/Blocks.txt")
# Where the generated Rust source is written.
RSBLOCKFILE = pathlib.Path("src/unicode_blocks.rs")

# Use the response as a context manager so the HTTP connection is closed as
# soon as the body has been read, instead of being left open.
with urllib.request.urlopen(
    "https://www.unicode.org/Public/UNIDATA/Blocks.txt"
) as resp:
    blocksdata = resp.read()
# Keep a byte-exact copy of the downloaded data next to the generated code.
TXTBLOCKFILE.write_bytes(blocksdata)

# Collected (name, low, high) tuples; low and high stay as the hexadecimal
# strings captured from the file. Lines that do not match BLOCKDEF are skipped.
blocks = []
for line in blocksdata.decode().splitlines():
    if match := BLOCKDEF.match(line.strip()):
        name = match.group("name")
        if name in {"Low Surrogates", "High Surrogates", "High Private Use Surrogates"}:
            # Surrogate code points are not valid chars in rust
            continue
        low = match.group("low")
        high = match.group("high")
        blocks.append((name, low, high))
def constname(blockname):
    """Return the constant identifier derived from a block name.

    Every space and hyphen is turned into an underscore and the result is
    upper-cased.
    """
    separators_to_underscore = str.maketrans(" -", "__")
    underscored = blockname.translate(separators_to_underscore)
    return underscored.upper()
# Rust source files must be UTF-8. Pin the encoding instead of relying on the
# locale default, because sys.argv[0] is interpolated into the header and may
# contain non-ASCII path characters.
with RSBLOCKFILE.open("w", encoding="utf-8") as file:
    # File header: rustfmt opt-out and a "generated code" notice.
    print("#![cfg_attr(rustfmt, rustfmt_skip)]", file=file)
    print(
        f"// Code generated by {sys.argv[0]}. DO NOT EDIT.",
        file=file,
    )
    print(file=file)
    # One public range constant per collected block.
    for name, low, high in blocks:
        # Doubled braces emit literal "{" / "}" around the hex code point,
        # producing e.g. '\u{0000}'..='\u{007F}'.
        rustrange = f"'\\u{{{low}}}'..='\\u{{{high}}}'"
        print(
            f"pub const {constname(name)}: std::ops::RangeInclusive<char> = {rustrange};",
            file=file,
        )
    # Lookup table from the original block name to the matching constant.
    print(
        """
/// UNICODE_BLOCKS is a mapping from the pretty block name to the character range.
pub static UNICODE_BLOCKS: phf::Map<&'static str, std::ops::RangeInclusive<char>> = phf::phf_map! {""",
        file=file,
    )
    for name, _, _ in blocks:
        print(f' "{name}" => {constname(name)},', file=file)
    print("};", file=file)