-
Notifications
You must be signed in to change notification settings - Fork 1
/
cqt.py
137 lines (122 loc) · 3.99 KB
/
cqt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# Reference code written from scratch by Daniel Hardman, and released
# under the Apache 2.0 license.
import re
import unicodedata
# Step 3: ampersand variants (ASCII, small form U+FE60, fullwidth U+FF06).
AMPERS_PAT = re.compile(r'[&\uFE60\uFF06]')

# Step 4: runs of line/paragraph separators, zero-width space, BOM,
# no-break space, ideographic space, CR, LF and tab.
SPECIALIZED_WHITESPACE_PAT = re.compile(r'[\u2028\u2029\u200B\uFEFF\u00A0\u3000\r\n\t]+')
# Step 4: two or more consecutive whitespace characters of any kind.
MULTI_WHITESPACE_PAT = re.compile(r'\s{2,}')

# Step 5.i: runs of Unicode dash/hyphen look-alikes.
DASHCHARS_PAT = re.compile(r'[\u058A\u05BE\u1400\u1806\u2010\u2011\u2012\u2013\u2014\u2015\u2e17\u2e1a\u2e3a\u2e3b\u2e40\u2e5d\u301c\u3030\u30a0\ufe31\ufe32\ufe58\ufe63\uff0d]+')
# Step 5.ii: two or more consecutive ASCII hyphens.
MULTI_HYPHENS_PAT = re.compile(r'-{2,}')

# Step 5.iii: (CJK punctuation, ASCII replacement) pairs.
CJK_PUNCT_PAIRS = [
    ("\u3001", ","),
    ("\u3002", "."),
]

# Step 5.v: four or more consecutive ASCII periods.
LONG_DOTS_PAT = re.compile(r'[.]{4,}')

# Step 5.vii: the ASCII double quote plus curly quotes, guillemets and
# CJK angle/corner brackets.
QUOTERS_PAT = re.compile(r'["\u2018\u2019\u201C\u201D\u00AB\u00BB\u2039\u203A\u3008\u3009\u300A\u300B\u300C\u300D]')

# Step 5.viii: any whitespace run, captured so it can be re-emitted verbatim.
ANY_WHITESPACE_PAT = re.compile(r'(\s+)')

# Step 5.ix: (autocorrected symbol, ASCII spelling) pairs.
# Code points above U+FFFF must use the eight-digit \UXXXXXXXX escape:
# "\u1f60A" would be parsed as U+1F60 followed by the letter "A" and the
# emoji would never match.
AUTOCORRECT_PAIRS = [
    ("\U0001F60A", ":-)"),
    ("\U0001F610", ":-|"),
    ("\u2639", ":-("),
    ("\U0001F603", ":-D"),
    ("\U0001F61D", ":-p"),
    ("\U0001F632", ":-o"),
    ("\U0001F609", ";-)"),
    ("\u2764", "<3"),
    ("\U0001F494", "</3"),
    ("\u00a9", "(c)"),
    ("\u00ae", "(R)"),
    ("\u2022", "*"),
]

# Step 5.x: (noseless ASCII emoticon, canonical form with a nose) pairs.
ASCII_EMOJI_PAIRS = [
    (":)", ":-)"),
    (":|", ":-|"),
    (":(", ":-("),
    (":D", ":-D"),
    (":p", ":-p"),
    (":o", ":-o"),
    (";)", ";-)"),
]
def algorithm_1_14(plaintext):
    """Canonicalize *plaintext* and return the result as UTF-8 bytes.

    The numbered steps mirror the comments below:

    2. NFKC-normalize the text.
    3. Replace ampersand variants with " and ".
    4. Turn special whitespace into plain spaces, trim both ends, and
       collapse remaining whitespace runs to a single space.
    5. Normalize punctuation: dashes, CJK and fullwidth forms, ellipses,
       the fraction slash, quote characters, whitespace adjacent to
       punctuation, autocorrected symbols, and ASCII emoticons.
    6. Encode as UTF-8.

    Args:
        plaintext: the text to canonicalize, as a ``str``.

    Returns:
        The canonical form of the text as ``bytes``.
    """
    # Step 1 is already complete, since Python 3 strings are Unicode.
    # Step 2
    x = unicodedata.normalize('NFKC', plaintext)

    def step3(ampersands):
        return AMPERS_PAT.sub(' and ', ampersands)

    x = step3(x)

    def step4(whitespace_anomalies):
        out = SPECIALIZED_WHITESPACE_PAT.sub(' ', whitespace_anomalies)
        out = out.strip()
        return MULTI_WHITESPACE_PAT.sub(' ', out)

    x = step4(x)

    def step5(punct_anomalies):
        def is_punct(ch):
            # Every Unicode punctuation category starts with "P".
            return unicodedata.category(ch).startswith('P')

        # 5.i
        out = DASHCHARS_PAT.sub('-', punct_anomalies)
        # 5.ii
        out = MULTI_HYPHENS_PAT.sub('-', out)
        # 5.iii
        for cjk, replacement in CJK_PUNCT_PAIRS:
            out = out.replace(cjk, replacement)
        # Fold fullwidth forms U+FF01..U+FF5E onto ASCII by subtracting
        # 0xFEE0. NFKC in step 2 normally does this already; the mapping is
        # kept as a safety net. str.translate accepts code points as values,
        # which avoids the str + int concatenation this loop used to attempt.
        fullwidth_to_ascii = {cp: cp - 0xFEE0 for cp in range(0xFF01, 0xFF5F)}
        out = out.translate(fullwidth_to_ascii)
        # 5.iv
        out = out.replace('\u2026', '...')
        # 5.v
        out = LONG_DOTS_PAT.sub('...', out)
        # 5.vi
        out = out.replace('\u2044', '/')
        # 5.vii
        out = QUOTERS_PAT.sub("'", out)
        # 5.viii: drop any whitespace run that touches punctuation on
        # either side; keep every other run unchanged.
        pieces = []
        pos = 0
        for match in ANY_WHITESPACE_PAT.finditer(out):
            start, end = match.span()
            pieces.append(out[pos:start])
            touches_punct = (
                (start > 0 and is_punct(out[start - 1]))
                or (end < len(out) and is_punct(out[end]))
            )
            if not touches_punct:
                pieces.append(match.group(1))
            pos = end
        pieces.append(out[pos:])
        out = ''.join(pieces)
        # 5.ix
        for autocorrect, replacement in AUTOCORRECT_PAIRS:
            out = out.replace(autocorrect, replacement)
        # 5.x
        for noncanonical, canonical in ASCII_EMOJI_PAIRS:
            out = out.replace(noncanonical, canonical)
        # Return the fully processed text; returning the 5.viii buffer here
        # would silently discard steps 5.ix and 5.x.
        return out

    x = step5(x)
    # Step 6
    return x.encode("UTF-8")
if __name__ == '__main__':
    # Interactive demo: read text from stdin, print its canonical form and
    # the URL-safe base64 of its BLAKE3 digest.
    from blake3 import blake3
    import base64

    print("Enter some text (two blank lines to end): ")
    lines = []
    blank_count = 0
    while True:
        try:
            line = input().strip()
        except EOFError:
            # stdin was closed (e.g. piped input) before two blank lines
            # arrived; treat that as the end of the text.
            break
        if not line:
            blank_count += 1
            if blank_count > 1:
                break
        else:
            blank_count = 0
        # Blank lines only add newlines to the joined text, and step 4 of
        # the algorithm collapses those into ordinary whitespace.
        lines.append(line)
    cqt = algorithm_1_14("\n".join(lines))
    cqt_txt = cqt.decode("UTF-8")
    print(f"Canonical quoted text = {cqt_txt}")
    # Named "digest" rather than "hash" to avoid shadowing the builtin.
    digest = base64.urlsafe_b64encode(blake3(cqt).digest()).decode("ASCII")
    print(f"hash = {digest}")