forked from pkoukk/tiktoken-go
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtiktoken.go
123 lines (106 loc) · 3.04 KB
/
tiktoken.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
package tiktoken
import (
"fmt"
"regexp"
"strings"
"github.com/dlclark/regexp2"
)
var bpeLoader BpeLoader = NewDefaultBpeLoader()
func SetBpeLoader(loader BpeLoader) {
bpeLoader = loader
}
func GetEncoding(encodingName string) (*Tiktoken, error) {
enc, err := getEncoding(encodingName)
if err != nil {
return nil, err
}
pbe, err := NewCoreBPE(enc.MergeableRanks, enc.SpecialTokens, enc.PatStr)
if err != nil {
return nil, err
}
specialTokensSet := map[string]any{}
for k := range enc.SpecialTokens {
specialTokensSet[k] = true
}
return &Tiktoken{
bpe: pbe,
pbeEncoding: enc,
specialTokensSet: specialTokensSet,
}, nil
}
func EncodingForModel(modelName string) (*Tiktoken, error) {
if encodingName, ok := MODEL_TO_ENCODING[modelName]; ok {
return GetEncoding(encodingName)
} else {
for prefix, encodingName := range MODEL_PREFIX_TO_ENCODING {
if strings.HasPrefix(modelName, prefix) {
return GetEncoding(encodingName)
}
}
}
return nil, fmt.Errorf("no encoding for model %s", modelName)
}
type Tiktoken struct {
bpe *CoreBPE
pbeEncoding *Encoding
specialTokensSet map[string]any
}
func (t *Tiktoken) Encode(text string, allowedSpecial []string, disallowedSpecial []string) []int {
var allowedSpecialSet map[string]any
if len(allowedSpecial) == 0 {
allowedSpecialSet = map[string]any{}
} else if len(allowedSpecial) == 1 && allowedSpecial[0] == "all" {
allowedSpecialSet = t.specialTokensSet
} else {
allowedSpecialSet = map[string]any{}
for _, v := range allowedSpecial {
allowedSpecialSet[v] = nil
}
}
disallowedSpecialSet := map[string]any{}
for _, v := range disallowedSpecial {
disallowedSpecialSet[v] = nil
}
if len(disallowedSpecial) == 1 && disallowedSpecial[0] == "all" {
disallowedSpecialSet = difference(t.specialTokensSet, allowedSpecialSet)
}
if len(disallowedSpecialSet) > 0 {
specialRegex := t.SpecialTokenRegex(disallowedSpecialSet)
m := findRegex2StringMatch(text, specialRegex)
if m != "" {
panic(fmt.Sprintf("text contains disallowed special token %s", m))
}
}
tokens, _ := t.bpe.encodeNative(text, allowedSpecialSet)
return tokens
}
func (t *Tiktoken) EncodeOrdinary(text string) []int {
return (t.bpe.encodeOrdinaryNative(text))
}
func (t *Tiktoken) Decode(tokens []int) string {
return string(t.bpe.decodeNative(tokens))
}
func (t *Tiktoken) SpecialTokenRegex(disallowedSpecialSet map[string]any) *regexp2.Regexp {
specialRegexStrs := make([]string, 0, len(disallowedSpecialSet))
for k := range disallowedSpecialSet {
specialRegexStrs = append(specialRegexStrs, regexp.QuoteMeta(k))
}
specialRegex := regexp2.MustCompile(strings.Join(specialRegexStrs, "|"), regexp2.None)
return specialRegex
}
func findRegex2StringMatch(text string, reg *regexp2.Regexp) string {
m, _ := reg.FindStringMatch(text)
if m == nil {
return ""
}
return m.String()
}
func difference(setA, setB map[string]any) map[string]any {
result := make(map[string]any)
for k := range setA {
if _, ok := setB[k]; !ok {
result[k] = true
}
}
return result
}