forked from kootenpv/neural_complete
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathencoder_decoder.py
124 lines (102 loc) · 4.06 KB
/
encoder_decoder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
from collections import Counter
import numpy as np
import tokenize as tk
from io import BytesIO
def text_tokenize(txt):
    """Tokenize *txt* as Python source, keeping inter-token spacing.

    Comments are dropped, the leading ENCODING token is discarded, and a
    trailing space in the input is preserved as its own token.
    """
    raw = []
    try:
        stream = BytesIO(txt.encode('utf-8'))
        for tok in tk.tokenize(stream.readline):
            raw.append(tok)
    except tk.TokenError:
        # Incomplete source (e.g. an unclosed bracket) — keep what we got.
        pass
    pieces = []
    prev_end = (0, 0)
    for tok in raw:
        if not tok.string:
            # Skip zero-width tokens (implicit NEWLINE, ENDMARKER, ...).
            continue
        on_same_row = tok.start[0] == prev_end[0]
        if on_same_row and tok.start[1] > prev_end[1]:
            # Re-insert the whitespace the tokenizer swallowed.
            pieces.append(" " * (tok.start[1] - prev_end[1]))
        pieces.append(tok.string)
        prev_end = tok.end
    if txt.endswith(" "):
        pieces.append(" ")
    kept = [piece for piece in pieces if not piece.startswith("#")]
    # kept[0] is the ENCODING token's string ('utf-8') — drop it.
    return kept[1:]
class EncoderDecoder():
    """Base class mapping question/answer token sequences to one-hot arrays.

    Subclasses must implement ``build_data`` to populate ``self.questions``
    (lists of tokens) and ``self.answers`` (single tokens) and return the
    training arrays ``(X, y)``.
    """

    def __init__(self, maxlen, min_count, unknown, padding, tokenize, untokenize):
        # maxlen: fixed question length (time steps) in the one-hot encoding
        # min_count: tokens seen fewer times than this map to `unknown`
        # unknown / padding: sentinel token strings
        # tokenize / untokenize: callables converting text <-> token lists
        self.maxlen = maxlen
        self.min_count = min_count
        self.unknown = unknown
        self.padding = padding
        self.tokenize = tokenize
        self.untokenize = untokenize
        self.questions = []
        self.answers = []
        # encoder (token -> index) / decoder (index -> token) dicts,
        # filled by build_qa_coders()
        self.ex, self.dx = None, None
        self.ey, self.dy = None, None
        self.X, self.y = self.build_data()

    def build_data(self):
        """Subclass hook: fill questions/answers and return (X, y)."""
        raise NotImplementedError

    def encode_x(self, x):
        # Unseen question tokens map to index 0 (the `unknown` slot).
        return self.ex.get(x, 0)

    def encode_y(self, y):
        return self.ey.get(y, 0)

    def decode_x(self, x):
        return self.dx.get(x, self.unknown)

    def decode_y(self, y):
        return self.dy.get(y, self.unknown)

    def build_coders(self, tokens):
        """Build (token -> index, index -> token) dicts from nested token lists.

        Tokens occurring fewer than `min_count` times are dropped; index 0 is
        reserved for the `unknown` sentinel.
        """
        flat = [item for sublist in tokens for item in sublist]
        word_to_index = {k: v for k, v in Counter(flat).items() if v >= self.min_count}
        word_to_index = {k: i for i, (k, v) in enumerate(word_to_index.items(), 1)}
        word_to_index[self.unknown] = 0
        index_to_word = {v: k for k, v in word_to_index.items()}
        index_to_word[0] = self.unknown
        return word_to_index, index_to_word

    def build_qa_coders(self):
        """Build encoder/decoder dicts for both questions and answers."""
        self.ex, self.dx = self.build_coders(self.questions)
        print("unique question tokens:", len(self.ex))
        self.ey, self.dy = self.build_coders([self.answers])
        print("unique answer tokens:", len(self.ey))

    def get_xy(self):
        """One-hot encode every question/answer pair.

        Returns:
            X: bool array of shape (n_pairs, maxlen, len(self.ex))
            y: bool array of shape (n_pairs, len(self.ey))
        """
        n = len(self.questions)
        # Builtin `bool`, not the alias np.bool removed in NumPy 1.24+.
        X = np.zeros((n, self.maxlen, len(self.ex)), dtype=bool)
        y = np.zeros((n, len(self.ey)), dtype=bool)
        for num_pair, (question, answer) in enumerate(zip(self.questions, self.answers)):
            for num_token, q_token in enumerate(question):
                X[num_pair, num_token, self.encode_x(q_token)] = 1
            y[num_pair, self.encode_y(answer)] = 1
        return X, y

    def pad(self, tokens):
        # Left-pad to maxlen + 1 elements; callers drop the first element
        # (see encode_question), leaving exactly maxlen tokens.
        seqlen = len(tokens)
        return [self.padding] * (self.maxlen - seqlen + 1) + tokens

    def encode_question(self, text):
        """One-hot encode a single question string as a (1, maxlen, len(ex)) array."""
        X = np.zeros((1, self.maxlen, len(self.ex)), dtype=bool)
        prepped = self.pad(self.tokenize(text)[-self.maxlen:])
        for num, x in enumerate(prepped[1:]):
            X[0, num, self.encode_x(x)] = 1
        return X
class TextEncoderDecoder(EncoderDecoder):
    """Sliding-window language-model data: each window of `maxlen` tokens
    is a question whose answer is the token that immediately follows it."""

    def __init__(self, texts, tokenize=str.split, untokenize=" ".join,
                 window_step=3, maxlen=20, min_count=1,
                 unknown="UNKNOWN", padding="PADDING"):
        self.texts = texts
        self.window_step = window_step
        super(TextEncoderDecoder, self).__init__(
            maxlen, min_count, unknown, padding, tokenize, untokenize)

    def build_data(self):
        """Slice every padded text into (window, next-token) pairs and encode."""
        self.questions, self.answers = [], []
        for raw in self.texts:
            padded = self.pad(self.tokenize(raw))
            last_start = len(padded) - self.maxlen
            for start in range(0, last_start, self.window_step):
                stop = start + self.maxlen
                self.questions.append(padded[start:stop])
                self.answers.append(padded[stop])
        self.build_qa_coders()
        print("number of QA pairs:", len(self.questions))
        return self.get_xy()
class QuestionAnswerEncoderDecoder(EncoderDecoder):
    # Placeholder subclass: build_data is not implemented here, so instantiating
    # this class raises NotImplementedError via EncoderDecoder.build_data.
    pass