-
Notifications
You must be signed in to change notification settings - Fork 1
/
io.go
172 lines (142 loc) · 3.77 KB
/
io.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
package corpus
import (
	"bufio"
	"bytes"
	"encoding/gob"
	"fmt"
	"io"
	"strconv"
	"strings"

	"github.com/pkg/errors"
)
// sortutil bundles parallel slices so that words (and, when present, their
// frequencies) are reordered together according to their IDs. It provides
// Len, Less and Swap, the method set required by sort.Interface.
type sortutil struct {
	words []string
	ids   []int
	freqs []int // optional; may be empty
}

// Len reports how many words are held.
func (s *sortutil) Len() int {
	return len(s.words)
}

// Less reports whether the ID at index i is smaller than the ID at index j,
// which yields an ascending-ID ordering.
func (s *sortutil) Less(i, j int) bool {
	return s.ids[j] > s.ids[i]
}

// Swap exchanges entries i and j in words and ids, and also in freqs when
// freqs is non-empty.
func (s *sortutil) Swap(i, j int) {
	s.words[i], s.words[j] = s.words[j], s.words[i]
	s.ids[i], s.ids[j] = s.ids[j], s.ids[i]
	if len(s.freqs) == 0 {
		// No frequencies are being tracked, so there is nothing more to move.
		return
	}
	s.freqs[i], s.freqs[j] = s.freqs[j], s.freqs[i]
}
// ToDictWithFreq builds a plain, marshalable view of the corpus: a map keyed by
// word whose value pairs the word's position in the corpus word list (ID) with
// the frequency stored at that same position (Freq). Conceptually it is a JSON
// object with the words as keys.
func ToDictWithFreq(c *Corpus) map[string]struct{ ID, Freq int } {
	// Local alias for the anonymous value type, purely to keep the lines short.
	type entry = struct{ ID, Freq int }

	dict := make(map[string]entry, len(c.words))
	for id, word := range c.words {
		dict[word] = entry{ID: id, Freq: c.frequencies[id]}
	}
	return dict
}
// ToDict returns a marshalable word-to-ID dictionary. The result is a fresh
// copy of the corpus' ID mapping, so changing it does not affect the corpus.
func ToDict(c *Corpus) map[string]int {
	dict := make(map[string]int, len(c.ids))
	for word, id := range c.ids {
		dict[word] = id
	}
	return dict
}
// GobEncode implements GobEncoder for *Corpus. It writes, in this fixed order,
// words, ids, frequencies, maxid, totalFreq and maxWordLength into a single gob
// stream and returns the encoded bytes. The first encoding failure is returned
// as-is together with a nil slice.
func (c *Corpus) GobEncode() ([]byte, error) {
	// The order here is the wire format; GobDecode reads the values back in
	// exactly the same sequence.
	fields := []interface{}{
		c.words,
		c.ids,
		c.frequencies,
		c.maxid,
		c.totalFreq,
		c.maxWordLength,
	}

	out := new(bytes.Buffer)
	enc := gob.NewEncoder(out)
	for _, field := range fields {
		if err := enc.Encode(field); err != nil {
			return nil, err
		}
	}
	return out.Bytes(), nil
}
// GobDecode implements GobDecoder for *Corpus. It reads a gob stream from buf
// and fills, in this fixed order, words, ids, frequencies, maxid, totalFreq and
// maxWordLength. Decoding stops at the first failure and that error is returned
// as-is; fields decoded before the failure keep their new values.
func (c *Corpus) GobDecode(buf []byte) error {
	// The order here mirrors the sequence in which GobEncode writes the values.
	targets := []interface{}{
		&c.words,
		&c.ids,
		&c.frequencies,
		&c.maxid,
		&c.totalFreq,
		&c.maxWordLength,
	}

	dec := gob.NewDecoder(bytes.NewReader(buf))
	for _, target := range targets {
		if err := dec.Decode(target); err != nil {
			return err
		}
	}
	return nil
}
// LoadOneGram loads a 1_gram.txt style listing of word frequency counts from r
// into the corpus. Every non-empty line holds a word and its count separated by
// a tab; any further tab-separated columns are ignored. Example (word, tab,
// count):
//
//	the	23135851162
//	of	13151942776
//	and	12997637966
//	to	12136980858
//	a	9081174698
//	in	8469404971
//	for	5933321709
//
// Empty lines are skipped. Loading stops and an error is returned when a line
// has no tab-separated count, when the count is not a valid integer (the
// strconv error is returned unchanged), or when reading from r fails. Entries
// loaded before the failure remain in the corpus.
func (c *Corpus) LoadOneGram(r io.Reader) error {
	scanner := bufio.NewScanner(r)
	for lineNo := 1; scanner.Scan(); lineNo++ {
		line := scanner.Text()
		if line == "" {
			continue
		}

		// strings.Split never returns an empty slice, so a missing count must
		// be detected by checking for fewer than two fields; indexing
		// splits[1] unconditionally would panic on such a line.
		splits := strings.Split(line, "\t")
		if len(splits) < 2 {
			return fmt.Errorf("line %d: expected word and count separated by a tab, got %q", lineNo, line)
		}

		word := splits[0] // TODO: normalize
		count, err := strconv.Atoi(splits[1])
		if err != nil {
			return err
		}

		id := c.Add(word)
		c.frequencies[id] = count
		// NOTE(review): presumably Add has already counted this word once in
		// totalFreq, and the decrement undoes that before the real count is
		// added - confirm against Add.
		c.totalFreq--
		c.totalFreq += count

		// Word length is measured in runes, not bytes.
		wc := len([]rune(word))
		if wc > c.maxWordLength {
			c.maxWordLength = wc
		}
	}
	// Surface read failures (including over-long lines) instead of reporting
	// a truncated load as success.
	return scanner.Err()
}
// FromTextCorpus reads text from r line by line and constructs a Corpus from
// the resulting tokens. Each line is first passed through normalizer and the
// result is split by tokenizer. A nil tokenizer defaults to trimming "\r", "\n"
// and spaces from both ends of the line and splitting on single spaces; a nil
// normalizer leaves the line unchanged. If scanning r fails, the error is
// wrapped and returned together with a nil Corpus.
func FromTextCorpus(r io.Reader, tokenizer func(a string) []string, normalizer func(a string) string) (*Corpus, error) {
	if normalizer == nil {
		normalizer = func(line string) string { return line }
	}
	if tokenizer == nil {
		tokenizer = func(line string) []string {
			trimmed := strings.Trim(line, "\r\n ")
			return strings.Split(trimmed, " ")
		}
	}

	var tokens []string
	lines := bufio.NewScanner(r)
	for lines.Scan() {
		normalized := normalizer(lines.Text())
		tokens = append(tokens, tokenizer(normalized)...)
	}
	if err := lines.Err(); err != nil {
		return nil, errors.Wrap(err, "Unable to read from text corpus")
	}
	return Construct(WithWords(tokens))
}