-
Notifications
You must be signed in to change notification settings - Fork 0
/
token_writer_test.go
109 lines (73 loc) · 2.37 KB
/
token_writer_test.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
package datok
import (
"bytes"
"strings"
"testing"
"github.com/stretchr/testify/assert"
)
// TestTokenWriterSimple checks that the SIMPLE token writer emits
// tokens, sentence ends and text ends as newline-separated output.
func TestTokenWriterSimple(t *testing.T) {
	assert := assert.New(t)

	buf := bytes.NewBuffer(make([]byte, 0, 2048))

	writer := NewTokenWriter(buf, SIMPLE)
	assert.NotNil(writer)

	writer.Token(0, []rune{'a', 'b', 'c'})

	// An offset of 1 skips the first rune, so only "ef" is written.
	writer.Token(1, []rune{'d', 'e', 'f'})

	writer.SentenceEnd(0)
	writer.TextEnd(0)
	writer.Flush()

	assert.Equal("abc\nef\n\n\n", buf.String())
}
// TestTokenWriterFromOptions checks the output of the token writer
// under different flag combinations (tokens, sentence boundaries,
// token offsets, sentence offsets, newline handling after EOT),
// using the German tokenizer matrix as transducer.
func TestTokenWriterFromOptions(t *testing.T) {
	assert := assert.New(t)

	mat := LoadMatrixFile("testdata/tokenizer_de.matok")
	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	// Tokens, sentence boundaries and token offsets
	tws := NewTokenWriter(w, TOKENS|SENTENCES|TOKEN_POS)

	assert.True(mat.TransduceTokenWriter(
		strings.NewReader("This.\x0a\x04And.\n\x04\n"), tws),
	)

	matStr := w.String()
	assert.Equal("This\n.\n\n0 4 4 5\nAnd\n.\n\n0 3 3 4\n", matStr)

	w.Reset()
	// Assert the transduction result here and below as well; it was
	// previously ignored, so a failed transduction would only surface
	// indirectly through the string comparison.
	assert.True(mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws))

	matStr = w.String()
	assert.Equal("This\n.\n\n1 5 5 6\nAnd\n.\n\n1 4 4 5\n", matStr)

	//
	// Accept newline after EOT
	tws = NewTokenWriter(w, TOKENS|SENTENCES|TOKEN_POS|NEWLINE_AFTER_EOT)

	w.Reset()
	assert.True(mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws))

	matStr = w.String()
	assert.Equal("This\n.\n\n1 5 5 6\nAnd\n.\n\n0 3 3 4\n", matStr)

	//
	// Write no tokens
	tws = NewTokenWriter(w, SENTENCES|TOKEN_POS|NEWLINE_AFTER_EOT)

	w.Reset()
	assert.True(mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws))

	matStr = w.String()
	assert.Equal("\n1 5 5 6\n\n0 3 3 4\n", matStr)

	//
	// Write sentence offsets
	tws = NewTokenWriter(w, TOKEN_POS|SENTENCE_POS|NEWLINE_AFTER_EOT)

	w.Reset()
	assert.True(mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws))

	matStr = w.String()
	assert.Equal("1 5 5 6\n1 6\n0 3 3 4\n0 4\n", matStr)

	w.Reset()
	assert.True(mat.TransduceTokenWriter(strings.NewReader("Tree\n\x04\n"), tws))

	matStr = w.String()
	assert.Equal("0 4\n0 4\n", matStr)

	w.Reset()
	assert.True(mat.TransduceTokenWriter(strings.NewReader("Tree.\n\x04\n"), tws))

	matStr = w.String()
	assert.Equal("0 4 4 5\n0 5\n", matStr)

	//
	// Write sentence offsets without token offsets
	tws = NewTokenWriter(w, SENTENCE_POS|NEWLINE_AFTER_EOT)

	w.Reset()
	assert.True(mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws))

	matStr = w.String()
	assert.Equal("1 6\n0 4\n", matStr)
}