forked from unidoc/unitype
-
Notifications
You must be signed in to change notification settings - Fork 0
/
encoding.go
198 lines (178 loc) · 6.64 KB
/
encoding.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package unitype
import (
"encoding/binary"
"unicode/utf8"
"github.com/sirupsen/logrus"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/charmap"
"golang.org/x/text/encoding/japanese"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/encoding/traditionalchinese"
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/encoding/unicode/utf32"
)
// macGlyphNames lists the 258 standard mac glyph names used in 'post' format 1 and 2.
// https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6post.html
//
// NOTE(review): presumably a glyph name is looked up by its position in this
// slice, so the order of the entries must not be changed - confirm against the
// 'post' table parser.
var macGlyphNames = []GlyphName{
".notdef", ".null", "nonmarkingreturn", "space", "exclam", "quotedbl",
"numbersign", "dollar", "percent", "ampersand", "quotesingle",
"parenleft", "parenright", "asterisk", "plus", "comma", "hyphen",
"period", "slash", "zero", "one", "two", "three", "four", "five",
"six", "seven", "eight", "nine", "colon", "semicolon", "less",
"equal", "greater", "question", "at", "A", "B", "C", "D", "E", "F",
"G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S",
"T", "U", "V", "W", "X", "Y", "Z", "bracketleft", "backslash",
"bracketright", "asciicircum", "underscore", "grave", "a", "b",
"c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o",
"p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "braceleft",
"bar", "braceright", "asciitilde", "Adieresis", "Aring",
"Ccedilla", "Eacute", "Ntilde", "Odieresis", "Udieresis", "aacute",
"agrave", "acircumflex", "adieresis", "atilde", "aring",
"ccedilla", "eacute", "egrave", "ecircumflex", "edieresis",
"iacute", "igrave", "icircumflex", "idieresis", "ntilde", "oacute",
"ograve", "ocircumflex", "odieresis", "otilde", "uacute", "ugrave",
"ucircumflex", "udieresis", "dagger", "degree", "cent", "sterling",
"section", "bullet", "paragraph", "germandbls", "registered",
"copyright", "trademark", "acute", "dieresis", "notequal", "AE",
"Oslash", "infinity", "plusminus", "lessequal", "greaterequal",
"yen", "mu", "partialdiff", "summation", "product", "pi",
"integral", "ordfeminine", "ordmasculine", "Omega", "ae", "oslash",
"questiondown", "exclamdown", "logicalnot", "radical", "florin",
"approxequal", "Delta", "guillemotleft", "guillemotright",
"ellipsis", "nonbreakingspace", "Agrave", "Atilde", "Otilde", "OE",
"oe", "endash", "emdash", "quotedblleft", "quotedblright",
"quoteleft", "quoteright", "divide", "lozenge", "ydieresis",
"Ydieresis", "fraction", "currency", "guilsinglleft",
"guilsinglright", "fi", "fl", "daggerdbl", "periodcentered",
"quotesinglbase", "quotedblbase", "perthousand", "Acircumflex",
"Ecircumflex", "Aacute", "Edieresis", "Egrave", "Iacute",
"Icircumflex", "Idieresis", "Igrave", "Oacute", "Ocircumflex",
"apple", "Ograve", "Uacute", "Ucircumflex", "Ugrave", "dotlessi",
"circumflex", "tilde", "macron", "breve", "dotaccent", "ring",
"cedilla", "hungarumlaut", "ogonek", "caron", "Lslash", "lslash",
"Scaron", "scaron", "Zcaron", "zcaron", "brokenbar", "Eth", "eth",
"Yacute", "yacute", "Thorn", "thorn", "minus", "multiply",
"onesuperior", "twosuperior", "threesuperior", "onehalf",
"onequarter", "threequarters", "franc", "Gbreve", "gbreve",
"Idotaccent", "Scedilla", "scedilla", "Cacute", "cacute", "Ccaron",
"ccaron", "dcroat",
}
// Platform identifiers that getCmapEncoding switches on to select a cmapEncoding.
//
// Only platformIDUnicode is declared with an explicit `int` type; the other
// two have no type on their own line and are therefore untyped integer
// constants.
const (
platformIDUnicode int = 0
platformIDMacintosh = 1
platformIDWindows = 3
)
// getCmapEncoding returns the cmapEncoding for the specified `platformID` and platform-specific `encodingID`.
//
// Mapping:
//   - Unicode platform: encoding IDs 4 and 6 (full Unicode repertoire) map to
//     UCS-4 so that charcodes above 0xFFFF are not truncated to 16 bits; every
//     other encoding ID maps to UCS-2.
//   - Macintosh platform: always MacRoman, regardless of `encodingID`.
//   - Windows platform: selected by `encodingID` (see the cases below).
//
// Any other combination is logged at debug level and reported as
// cmapEncodingUnsupported.
func getCmapEncoding(platformID, encodingID int) cmapEncoding {
	switch platformID {
	case platformIDUnicode:
		switch encodingID {
		case 4, 6: // Unicode full repertoire (charcodes may exceed the BMP).
			return cmapEncodingUCS4
		}
		// BMP-only (and legacy) Unicode encodings.
		return cmapEncodingUCS2
	case platformIDMacintosh:
		return cmapEncodingMacRoman
	case platformIDWindows:
		switch encodingID {
		case 0: // Symbol
			// TODO(gunnsth): Is this correct for symbol?
			return cmapEncodingUCS2
		case 1: // Unicode BMP-only (UCS-2)
			return cmapEncodingUCS2
		case 2: // Shift-JIS
			return cmapEncodingShiftJIS
		case 3: // PRC
			return cmapEncodingPRC
		case 4: // BigFive
			return cmapEncodingBig5
		case 5: // Johab.
			return cmapEncodingJohab
		case 10: // Unicode UCS-4.
			return cmapEncodingUCS4
		}
	}

	// Unknown platform, or a Windows encoding ID without a mapping.
	logrus.Debugf("Unsupported: PlatformID=%d, EncodingID=%d", platformID, encodingID)
	return cmapEncodingUnsupported
}
// cmapEncoding identifies the character encoding of the charcodes in a cmap
// subtable. Values are produced by getCmapEncoding and turned into a decoder
// by GetRuneDecoder.
type cmapEncoding int

// Supported cmapEncoding values, numbered sequentially from 0 via iota.
//
// GetRuneDecoder has no dedicated case for cmapEncodingJohab or
// cmapEncodingUnsupported; both take its fallback path.
const (
cmapEncodingUCS2 cmapEncoding = iota
cmapEncodingUCS4
cmapEncodingMacRoman
cmapEncodingShiftJIS
cmapEncodingPRC
cmapEncodingBig5
cmapEncodingJohab
cmapEncodingUnsupported
)
// GetRuneDecoder builds the runeDecoder matching the cmapEncoding `e`: a text
// decoder for the encoding plus the number of bytes each charcode occupies.
//
// Encodings without a dedicated case fall back to a UTF-8 decoder with
// single-byte charcodes, and the fallback is logged at debug level.
// TODO(gunnsth): Combine this and getCmapEncoding into a single function?
func (e cmapEncoding) GetRuneDecoder() runeDecoder {
	switch e {
	case cmapEncodingUCS2:
		// UCS2 is a subset of UTF16.
		// Typically is big endian, although it is not specified explicitly in the specifications.
		return runeDecoder{
			Decoder:       unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM).NewDecoder(),
			charcodeBytes: 2,
		}
	case cmapEncodingUCS4:
		// UCS4 is a subset of UTF32.
		return runeDecoder{
			Decoder:       utf32.UTF32(utf32.BigEndian, utf32.IgnoreBOM).NewDecoder(),
			charcodeBytes: 4,
		}
	case cmapEncodingMacRoman:
		return runeDecoder{
			Decoder:       charmap.Macintosh.NewDecoder(),
			charcodeBytes: 1,
		}
	case cmapEncodingShiftJIS:
		return runeDecoder{
			Decoder:       japanese.ShiftJIS.NewDecoder(),
			charcodeBytes: 2,
		}
	case cmapEncodingPRC:
		return runeDecoder{
			Decoder:       simplifiedchinese.GBK.NewDecoder(),
			charcodeBytes: 2,
		}
	case cmapEncodingBig5:
		return runeDecoder{
			Decoder:       traditionalchinese.Big5.NewDecoder(),
			charcodeBytes: 2,
		}
	}

	// No decoder is available for this encoding: use the fallback.
	logrus.Debugf("ERROR: Unsupported encoding (%d) - returning charcodes as runes", e)
	return runeDecoder{
		Decoder:       unicode.UTF8.NewDecoder(),
		charcodeBytes: 1,
	}
}
// runeDecoder decodes runes from encoded byte data.
//
// It embeds an *encoding.Decoder, so the decoder's methods (e.g. Bytes) are
// available directly on a runeDecoder value. The embedded decoder is used
// without a nil check by DecodeRune.
type runeDecoder struct {
*encoding.Decoder
charcodeBytes int // number of bytes per charcode in TTF data.
}
// ToBytes encodes `charcode` into bytes as represented in TTF data.
//
// The result is d.charcodeBytes long (1, 2 or 4 bytes) in big-endian order.
// A charcode that does not fit in the configured width is truncated to its
// low-order bits. For any other width the problem is logged at debug level
// and a single zero byte is returned.
//
// The width is validated before any buffer is allocated, so an invalid
// (including negative) charcodeBytes yields the fallback value instead of
// panicking in make or allocating a buffer that is thrown away.
func (d runeDecoder) ToBytes(charcode uint32) []byte {
	switch d.charcodeBytes {
	case 1:
		return []byte{byte(charcode)}
	case 2:
		b := make([]byte, 2)
		binary.BigEndian.PutUint16(b, uint16(charcode))
		return b
	case 4:
		b := make([]byte, 4)
		binary.BigEndian.PutUint32(b, charcode)
		return b
	default:
		logrus.Debugf("ERROR: Unsupported number of bytes per charcode: %d", d.charcodeBytes)
		return []byte{0}
	}
}
// DecodeRune decodes the character code bytes in `b` and returns the decoded rune.
//
// The embedded decoder converts `b` to UTF-8, and the first rune of that
// output is returned. A decoder error is only logged at debug level; the rune
// is then taken from whatever bytes the decoder returned, which gives
// utf8.RuneError when that output is empty or invalid.
func (d runeDecoder) DecodeRune(b []byte) rune {
	// The decoder emits its result in UTF8 byte format.
	utf8Bytes, decodeErr := d.Decoder.Bytes(b)
	if decodeErr != nil {
		logrus.Debugf("Decoding error: %v", decodeErr)
	}

	// TODO(gunnsth): Benchmark utf8.DecodeRune vs string().
	first, _ := utf8.DecodeRune(utf8Bytes)
	return first
}