-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcorpus.go
197 lines (164 loc) · 4.7 KB
/
corpus.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
package corpus
import (
"sync/atomic"
"unicode/utf8"
"github.com/pkg/errors"
)
// Corpus holds the vocabulary and bookkeeping data for a body of text. It maps
// every distinct word to an integer ID and back, which is useful because neural
// networks work on the IDs rather than on the text itself.
type Corpus struct {
	// maxid is the number of IDs handed out so far. It is read and written with
	// the sync/atomic functions, so it must stay the first field: only the first
	// word of an allocated struct is guaranteed to be 64-bit aligned on 32-bit
	// platforms, and a misaligned 64-bit atomic operation can panic there.
	maxid int64

	words       []string       // words[id] is the word that owns id
	frequencies []int          // frequencies[id] counts how often words[id] was seen
	ids         map[string]int // reverse lookup: word -> id

	totalFreq     int // running total of all occurrences, repeats included
	maxWordLength int // length, in runes, of the longest word recorded by Add
}
// New returns a *Corpus that is ready for use and already holds the three
// reserved entries, which receive the IDs 0, 1 and 2 in that order.
func New() *Corpus {
	corpus := &Corpus{
		words:       []string{},
		frequencies: []int{},
		ids:         map[string]int{},
	}

	// Reserved entries, listed in ID order.
	reserved := [...]string{
		"", // aka NULL - when there are no words
		"-UNKNOWN-",
		"-ROOT-",
	}
	for _, w := range reserved {
		corpus.Add(w)
	}

	// The reserved entries do not count towards the longest word.
	corpus.maxWordLength = 0
	return corpus
}
// Construct builds a Corpus by starting from empty (non-nil) lookup tables and
// applying each construction option in the order given. Unlike New it adds no
// entries of its own. The first option that returns an error aborts
// construction; that error is returned together with a nil corpus.
func Construct(opts ...ConsOpt) (*Corpus, error) {
	c := &Corpus{
		words:       make([]string, 0),
		frequencies: make([]int, 0),
		ids:         make(map[string]int),
	}

	for i := range opts {
		if err := opts[i](c); err != nil {
			return nil, err
		}
	}
	return c, nil
}
// Id looks word up in the corpus. It returns the word's ID and true when the
// word is known, or 0 and false when it is not.
func (c *Corpus) Id(word string) (int, bool) {
	if id, known := c.ids[word]; known {
		return id, true
	}
	return 0, false
}
// Word returns the word stored under id together with true. When id is
// negative, or not smaller than the number of IDs handed out so far, it returns
// the empty string and false instead.
func (c *Corpus) Word(id int) (string, bool) {
	size := int(atomic.LoadInt64(&c.maxid))
	// The lower bound matters: a negative id would otherwise slip past the
	// upper-bound check and panic when indexing c.words.
	if id < 0 || id >= size {
		return "", false
	}
	return c.words[id], true
}
// Add records one occurrence of word and returns its ID. A word that is already
// present only has its frequency (and the total frequency) incremented; a new
// word is appended under the next free ID with a frequency of 1, and the
// longest-word length is raised if the word has more runes than any before it.
func (c *Corpus) Add(word string) int {
	if existing, seen := c.ids[word]; seen {
		c.frequencies[existing]++
		c.totalFreq++
		return existing
	}

	// Claim the next slot. The counter is one past the last ID, hence the -1.
	id := int(atomic.AddInt64(&c.maxid, 1) - 1)

	c.ids[word] = id
	c.words = append(c.words, word)
	c.frequencies = append(c.frequencies, 1)
	c.totalFreq++

	// Lengths are measured in runes, not bytes.
	if n := utf8.RuneCountInString(word); n > c.maxWordLength {
		c.maxWordLength = n
	}
	return id
}
// Size reports how many IDs the corpus has handed out so far. The counter is
// read atomically.
func (c *Corpus) Size() int {
	return int(atomic.LoadInt64(&c.maxid))
}
// WordFreq reports how many times word has been counted in the corpus. Words
// the corpus does not know have a frequency of 0.
func (c *Corpus) WordFreq(word string) int {
	if id, known := c.ids[word]; known {
		return c.frequencies[id]
	}
	return 0
}
// IDFreq reports how many times the word stored under id has been counted. It
// returns 0 when id is negative or not smaller than the number of IDs handed
// out so far.
func (c *Corpus) IDFreq(id int) int {
	size := int(atomic.LoadInt64(&c.maxid))
	// The lower bound matters: a negative id would otherwise slip past the
	// upper-bound check and panic when indexing c.frequencies.
	if id < 0 || id >= size {
		return 0
	}
	return c.frequencies[id]
}
// TotalFreq reports the total number of word occurrences the corpus has
// counted. Repeated words contribute once per occurrence.
func (c *Corpus) TotalFreq() int {
	total := c.totalFreq
	return total
}
// MaxWordLength reports the longest word length the corpus has recorded.
func (c *Corpus) MaxWordLength() int {
	longest := c.maxWordLength
	return longest
}
// WordProb reports the relative frequency of word: its own count divided by the
// total count of all words. The boolean is false, and the probability 0, when
// the corpus does not know the word.
func (c *Corpus) WordProb(word string) (float64, bool) {
	id, known := c.ids[word]
	if !known {
		return 0, false
	}

	occurrences := float64(c.frequencies[id])
	total := float64(c.totalFreq)
	return occurrences / total, true
}
// Merge folds the contents of other into c. Every word of other ends up in c
// with other's frequency added to whatever c had already counted; words c did
// not know are appended under new IDs. Only the receiver is written to.
func (c *Corpus) Merge(other *Corpus) {
	for i, word := range other.words {
		extra := other.frequencies[i]

		id, known := c.ids[word]
		if !known {
			// Add already counts one occurrence, so only the remainder
			// is left to account for below.
			id = c.Add(word)
			extra--
		}

		c.frequencies[id] += extra
		c.totalFreq += extra
	}
}
// Replace makes the ID that currently belongs to a resolve to the word with
// instead. The mapping from a to that ID is kept, so afterwards both words
// look up the same ID.
//
// e.g: c.Replace("foo", "bar")
// c.Id("foo") still returns an ID, and it is the same one c.Id("bar") returns.
//
// An error is returned, and nothing is changed, when a is not in the corpus or
// when with is already in it.
func (c *Corpus) Replace(a, with string) error {
	id, found := c.ids[a]
	_, taken := c.ids[with]

	switch {
	case !found:
		return errors.Errorf("Cannot replace %q with %q. %q is not found", a, with, a)
	case taken:
		return errors.Errorf("Cannot replace %q with %q. %q exists in the corpus", a, with, with)
	}

	c.words[id] = with
	c.ids[with] = id
	return nil
}
// ReplaceWord makes id resolve to the word with. The word previously stored
// under id keeps its mapping to id, so both words look up the same ID
// afterwards.
//
// An error is returned, and nothing is changed, when id is negative or beyond
// the stored words, or when with is already in the corpus.
func (c *Corpus) ReplaceWord(id int, with string) error {
	// A negative id is just as out of bounds as one past the end; without the
	// lower-bound check it would panic when indexing c.words below.
	if id < 0 || id >= len(c.words) {
		return errors.Errorf("Cannot replace word with ID %d. Out of bounds.", id)
	}
	if _, ok := c.ids[with]; ok {
		return errors.Errorf("Cannot replace word with ID %d with %q. %q exists in the corpus", id, with, with)
	}
	c.words[id] = with
	c.ids[with] = id
	return nil
}