# roberta_training.py
# Forked from DebadityaPal/RoBERTa-NL2SQL
import torch

# Use the GPU when available; fall back to CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def get_where_column(conds):
    """
    Extract the where-clause column indices from:
    [ [where_column, where_operator, where_value],
      [where_column, where_operator, where_value], ... ]
    """
    where_column = []
    for cond in conds:
        where_column.append(cond[0])
    return where_column


def get_where_operator(conds):
    """
    Extract the where-clause operators from:
    [ [where_column, where_operator, where_value],
      [where_column, where_operator, where_value], ... ]
    """
    where_operator = []
    for cond in conds:
        where_operator.append(cond[1])
    return where_operator


def get_where_value(conds):
    """
    Extract the where-clause values from:
    [ [where_column, where_operator, where_value],
      [where_column, where_operator, where_value], ... ]
    """
    where_value = []
    for cond in conds:
        where_value.append(cond[2])
    return where_value
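
# Illustrative example of the three helpers above (WikiSQL-style conds; values hypothetical):
#   conds = [[5, 0, "tom"], [2, 1, 1990]]
#   get_where_column(conds)   -> [5, 2]
#   get_where_operator(conds) -> [0, 1]
#   get_where_value(conds)    -> ["tom", 1990]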

def get_ground_truth_values(canonical_sql_queries):
    ground_select_column = []
    ground_select_aggregate = []
    ground_where_number = []
    ground_where_column = []
    ground_where_operator = []
    ground_where_value = []
    for canonical_sql_query in canonical_sql_queries:
        ground_select_column.append(canonical_sql_query["sel"])
        ground_select_aggregate.append(canonical_sql_query["agg"])
        conds = canonical_sql_query["conds"]
        if canonical_sql_query["agg"] >= 0:
            ground_where_number.append(len(conds))
            ground_where_column.append(get_where_column(conds))
            ground_where_operator.append(get_where_operator(conds))
            ground_where_value.append(get_where_value(conds))
        else:
            raise ValueError("Aggregate index must be non-negative.")
    return ground_select_column, ground_select_aggregate, ground_where_number, ground_where_column, \
        ground_where_operator, ground_where_value
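
# Illustrative example (hypothetical WikiSQL-style query dict):
#   sql = {"sel": 3, "agg": 0, "conds": [[5, 0, "tom"], [2, 1, 1990]]}
#   get_ground_truth_values([sql])
#   -> ([3], [0], [2], [[5, 2]], [[0, 1]], [["tom", 1990]])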

def get_wemb_roberta(roberta_config, model_roberta, tokenizer, nlu_t, hds, max_seq_length,
                     num_out_layers_n=1, num_out_layers_h=1):
    """
    wemb_n      : word embeddings of the natural language question
    wemb_h      : word embeddings of the headers
    l_n         : length of each natural language question
    l_hs        : number of headers per table
    nlu_tt      : natural language question, doubly tokenized
    t_to_tt_idx : maps first-level tokenization to second-level tokenization
    tt_to_t_idx : maps second-level tokenization to first-level tokenization
    """
    # Get the contextual output of all tokens from RoBERTa.
    all_encoder_layer, i_nlu, i_headers, \
        l_n, l_hpu, l_hs, \
        nlu_tt, t_to_tt_idx, tt_to_t_idx = get_roberta_output(model_roberta, tokenizer, nlu_t, hds, max_seq_length)
    # all_encoder_layer: RoBERTa hidden states from all layers.
    # i_nlu: start and end indices of the question in the token sequence.
    # i_headers: start and end indices of each header.
    # Build the word embeddings.
    wemb_n = get_wemb_n(i_nlu, l_n, roberta_config.hidden_size, roberta_config.num_hidden_layers, all_encoder_layer,
                        num_out_layers_n)
    wemb_h = get_wemb_h(i_headers, l_hpu, l_hs, roberta_config.hidden_size, roberta_config.num_hidden_layers, all_encoder_layer,
                        num_out_layers_h)
    return wemb_n, wemb_h, l_n, l_hpu, l_hs, \
        nlu_tt, t_to_tt_idx, tt_to_t_idx
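
# Minimal usage sketch (hypothetical setup: `nlu_t` is assumed to be a batch of pre-tokenized
# questions, `hds` a batch of header lists, and 280 an arbitrary example sequence length):
#   from transformers import RobertaTokenizer, RobertaModel, RobertaConfig
#   tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
#   config = RobertaConfig.from_pretrained("roberta-base")
#   model = RobertaModel.from_pretrained("roberta-base").to(device)
#   wemb_n, wemb_h, l_n, l_hpu, l_hs, nlu_tt, t_to_tt_idx, tt_to_t_idx = \
#       get_wemb_roberta(config, model, tokenizer, nlu_t, hds, max_seq_length=280)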

def get_roberta_output(model_roberta, tokenizer, nlu_t, headers, max_seq_length):
    """
    Here, the input is tokenized further by the RoBERTa tokenizer and fed into RoBERTa.

    INPUT
    :param model_roberta: RoBERTa model
    :param tokenizer: RoBERTa tokenizer
    :param nlu_t: tokenized natural language utterances (questions)
    :param headers: headers of the table
    :param max_seq_length: maximum input token length

    OUTPUT
    tokens: RoBERTa input tokens
    nlu_tt: RoBERTa-tokenized natural language questions
    orig_to_tok_index: maps the index of a 1st-level token to the index of its 2nd-level token
    tok_to_orig_index: inverse map
    """
    l_n = []
    l_hs = []  # the number of columns (headers) for each batch element
    input_ids = []
    input_mask = []
    i_nlu = []  # indices used to retrieve the positions of the contextual vectors later
    i_headers = []
    nlu_tt = []
    t_to_tt_idx = []
    tt_to_t_idx = []
    for b, nlu_t1 in enumerate(nlu_t):
        batch_headers = headers[b]
        l_hs.append(len(batch_headers))
        # 1. Tokenization using the RoBERTa tokenizer.
        tt_to_t_idx1 = []  # tt_to_t_idx1[j] = index of the 1st-level token that sub-token j belongs to
        t_to_tt_idx1 = []  # t_to_tt_idx1[i] = start index of the i-th 1st-level token in nlu_tt1
        nlu_tt1 = []
        for (i, token) in enumerate(nlu_t1):
            t_to_tt_idx1.append(len(nlu_tt1))
            sub_tokens = tokenizer.tokenize(token, is_pretokenized=True)
            for sub_token in sub_tokens:
                tt_to_t_idx1.append(i)
                nlu_tt1.append(sub_token)
        nlu_tt.append(nlu_tt1)
        tt_to_t_idx.append(tt_to_t_idx1)
        t_to_tt_idx.append(t_to_tt_idx1)
        l_n.append(len(nlu_tt1))
        # <s> nlu </s> col1 </s> col2 </s> ... col-n </s>
        # 2. Generate RoBERTa inputs & indices.
        tokens, i_nlu1, i_batch_headers = generate_inputs(tokenizer, nlu_tt1, batch_headers)
        input_ids1 = tokenizer.convert_tokens_to_ids(tokens)
        # Input mask: 1 for real tokens and 0 for padding tokens.
        # Only real tokens are attended to.
        input_mask1 = [1] * len(input_ids1)
        # 3. Zero-pad up to the maximum sequence length.
        while len(input_ids1) < max_seq_length:
            input_ids1.append(0)
            input_mask1.append(0)
        assert len(input_ids1) == max_seq_length
        assert len(input_mask1) == max_seq_length
        input_ids.append(input_ids1)
        input_mask.append(input_mask1)
        i_nlu.append(i_nlu1)
        i_headers.append(i_batch_headers)
    # Convert to tensors.
    all_input_ids = torch.tensor(input_ids, dtype=torch.long).to(device)
    all_input_mask = torch.tensor(input_mask, dtype=torch.long).to(device)
    # 4. Generate the RoBERTa output.
    check, _, all_encoder_layer = model_roberta(input_ids=all_input_ids, attention_mask=all_input_mask,
                                                output_hidden_states=True)
    all_encoder_layer = list(all_encoder_layer)
    # The separately returned last hidden state must match the last entry of the hidden states.
    assert torch.equal(check, all_encoder_layer[-1])
    # 5. Generate l_hpu from i_headers.
    l_hpu = gen_l_hpu(i_headers)
    return all_encoder_layer, i_nlu, i_headers, \
        l_n, l_hpu, l_hs, \
        nlu_tt, t_to_tt_idx, tt_to_t_idx
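
# Rough shape of the bookkeeping outputs for a batch of one question with two headers
# (index values depend on the RoBERTa BPE splits, so they are only indicative):
#   nlu_t   = [["how", "many", "games"]]
#   headers = [["Team", "Games played"]]
#   l_n       ~ [3]                   # sub-tokens in the question
#   l_hs      = [2]                   # headers in the table
#   i_nlu     ~ [(1, 4)]              # question span inside <s> ... </s>
#   i_headers ~ [[(5, 6), (7, 9)]]    # one (start, end) pair per header
#   l_hpu     ~ [1, 2]                # sub-tokens per header, flattened over the batch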

def generate_inputs(tokenizer, nlu1_tok, hds1):
    tokens = []
    tokens.append("<s>")
    i_st_nlu = len(tokens)  # to use it later
    for token in nlu1_tok:
        tokens.append(token)
    i_ed_nlu = len(tokens)
    tokens.append("</s>")
    i_headers = []
    for i, hds11 in enumerate(hds1):
        i_st_hd = len(tokens)
        sub_tok = tokenizer.tokenize(hds11)
        tokens += sub_tok
        i_ed_hd = len(tokens)
        i_headers.append((i_st_hd, i_ed_hd))
        # A "</s>" separator follows every header, including the last one.
        tokens.append("</s>")
    i_nlu = (i_st_nlu, i_ed_nlu)
    return tokens, i_nlu, i_headers
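
# Worked example (hypothetical; assumes each word stays a single sub-token after BPE):
#   nlu1_tok = ["how", "many", "games"]; hds1 = ["Team", "Year"]
#   tokens    = ["<s>", "how", "many", "games", "</s>", "Team", "</s>", "Year", "</s>"]
#   i_nlu     = (1, 4)             # question span, end-exclusive
#   i_headers = [(5, 6), (7, 8)]   # one end-exclusive span per header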

def gen_l_hpu(i_headers):
    """
    Treat the headers as if they were a batch of natural language utterances with
    batch size = (# of columns) * (actual batch size).

    i_headers[b] = [(17, 18), (19, 21), (22, 23), (24, 25), (26, 29), (30, 34)]
    """
    l_hpu = []
    for i_header in i_headers:
        for index_pair in i_header:
            l_hpu.append(index_pair[1] - index_pair[0])
    return l_hpu
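
# Example with the spans from the docstring (a single batch element):
#   gen_l_hpu([[(17, 18), (19, 21), (22, 23), (24, 25), (26, 29), (30, 34)]])
#   -> [1, 2, 1, 1, 3, 4]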

def get_wemb_n(i_nlu, l_n, hS, num_hidden_layers, all_encoder_layer, num_out_layers_n):
    """
    Get the representation of each question token, concatenating `num_out_layers_n`
    upper RoBERTa layers along the hidden dimension.
    """
    bS = len(l_n)
    l_n_max = max(l_n)
    wemb_n = torch.zeros([bS, l_n_max, hS * num_out_layers_n]).to(device)
    for b in range(bS):
        i_nlu1 = i_nlu[b]
        for i_noln in range(num_out_layers_n):
            i_layer = num_hidden_layers - 1 - i_noln
            st = i_noln * hS
            ed = (i_noln + 1) * hS
            wemb_n[b, 0:(i_nlu1[1] - i_nlu1[0]), st:ed] = all_encoder_layer[i_layer][b, i_nlu1[0]:i_nlu1[1], :]
    return wemb_n
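
# Shape sketch (hypothetical values):
#   l_n = [5, 7], hS = 768, num_out_layers_n = 1
#   -> wemb_n.shape == torch.Size([2, 7, 768]); positions past each question's length stay zero.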

def get_wemb_h(i_headers, l_hpu, l_hs, hS, num_hidden_layers, all_encoder_layer, num_out_layers_h):
    """
    Header tokens are flattened across the batch, as if:
    [ [table-1-col-1-tok1, t1-c1-t2, ...],
      [t1-c2-t1, t1-c2-t2, ...],
      ...
      [t2-c1-t1, ...],
    ]
    """
    l_hpu_max = max(l_hpu)
    num_of_all_hds = sum(l_hs)
    wemb_h = torch.zeros([num_of_all_hds, l_hpu_max, hS * num_out_layers_h]).to(device)
    b_pu = -1
    for b, i_header in enumerate(i_headers):
        for b1, index_pair in enumerate(i_header):
            b_pu += 1
            for i_nolh in range(num_out_layers_h):
                i_layer = num_hidden_layers - 1 - i_nolh
                st = i_nolh * hS
                ed = (i_nolh + 1) * hS
                wemb_h[b_pu, 0:(index_pair[1] - index_pair[0]), st:ed] \
                    = all_encoder_layer[i_layer][b, index_pair[0]:index_pair[1], :]
    return wemb_h
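
# Shape sketch (hypothetical values):
#   l_hs = [2, 3] (headers per table), l_hpu = [1, 2, 1, 1, 3], hS = 768, num_out_layers_h = 1
#   -> wemb_h.shape == torch.Size([5, 3, 768]); one row per header across the whole batch,
#      zero-padded up to the longest header.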