-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpreprocess.py
101 lines (76 loc) · 2.51 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
from collections import defaultdict
import numpy as np
import torch
# Dataset locations, given relative to the current working directory.
TRAIN_FILE = "./train.txt"
TEST_FILE = "./test.txt"
# Vocabulary cap consulted by translate_to_integer when handing out word ids.
N_TOKENS = 8000
# Layout of one record in the data files (one field per line, records
# separated by a blank line):
# index 0: source, index 1: reference, index 2: candidate, index 3: score, index 4: label
def split(file):
    """Parse a dataset file into ``[text, score, label]`` records.

    The file holds records separated by a blank line. Each record has five
    lines: source, reference, candidate, score, label. The reference line
    (index 1) is not used.

    Args:
        file: Path of the dataset file to read.

    Returns:
        A list with one ``[text, score, label]`` entry per record, where
        ``text`` is the source and candidate lines joined by one space,
        ``score`` is the score line parsed as a float, and ``label`` is the
        label line as read.

    Raises:
        IndexError: If a non-blank record has fewer than five lines.
        ValueError: If a score line cannot be parsed as a float.
    """
    with open(file, "r") as f:
        text = f.read()

    records = []
    for block in text.split("\n\n"):
        # A file that ends with a blank line (or has extra blank lines
        # between records) yields empty chunks; skip them instead of
        # crashing on a missing field.
        if not block.strip():
            continue
        # Split each record once rather than once per extracted field.
        lines = block.strip("\n").split("\n")
        # Purposely left out punctuation removal
        records.append([lines[0] + " " + lines[2], float(lines[3]), lines[4]])
    return records
def translate_to_integer(data, n_tokens=None):
    """Replace each sample's text with a tensor of integer word ids.

    Ids are handed out in order of first appearance, starting at 1, using a
    vocabulary that is built from scratch on every call. Words first seen
    after the vocabulary is full are dropped from the sample.

    Args:
        data: Iterable of ``[text, score, label]`` records.
        n_tokens: Vocabulary cap; defaults to the module-level ``N_TOKENS``.

    Returns:
        A list of ``[token_ids, score, label]`` where ``token_ids`` is a 1-D
        ``torch.long`` tensor (also for samples with no known words) and
        ``score`` / ``label`` are passed through unchanged.
    """
    if n_tokens is None:
        n_tokens = N_TOKENS

    # A plain dict: looking up an unknown word must not insert it.
    word_to_idx = {}
    idx = 0
    result = []
    for sample in data:
        in_text, score, label = sample[0], sample[1], sample[2]
        in_text_ints = []
        for word in in_text.split():
            if word not in word_to_idx:
                # NOTE(review): this boundary admits n_tokens + 1 distinct
                # words (ids 1..n_tokens + 1). Kept as-is because code
                # outside this file may be sized for it -- confirm.
                if idx > n_tokens:
                    continue  # vocabulary full: unknown words are dropped
                idx += 1
                word_to_idx[word] = idx
            in_text_ints.append(word_to_idx[word])
        # Explicit dtype so an empty sample is not inferred as float32.
        result.append([torch.tensor(in_text_ints, dtype=torch.long), score, label])
    return result
def pad(input, total_length):
    """Right-pad every sample's token sequence with zeros, in place.

    Args:
        input: List of records whose element 0 is a token sequence, either
            a Python list or a 1-D tensor.
        total_length: Minimum length of each sequence after padding.
            Sequences that are already longer are not truncated.

    Returns:
        The same ``input`` list, with element 0 of every record replaced by
        a tensor of the padded sequence.
    """
    for record in input:
        tokens = record[0]
        # Tensors have no ``append``; pad via a plain list so that samples
        # already converted to tensors are accepted as well.
        if isinstance(tokens, torch.Tensor):
            tokens = tokens.tolist()
        # A non-positive count produces no padding.
        tokens.extend([0] * (total_length - len(tokens)))
        record[0] = torch.tensor(tokens)
    return input
def split_data_labels(data):
    """Split ``[sample, score, label]`` records into three parallel arrays.

    The arrays are built with ``dtype=object`` so every element keeps its
    original Python type. Passing mixed records straight to ``np.array``
    would turn float scores into strings, and ragged tensor samples cannot
    be stacked into a regular array.

    Args:
        data: Sequence of records; elements 0, 1 and 2 of each are used.

    Returns:
        A tuple ``(samples, scores, labels)`` of 1-D object arrays, each of
        length ``len(data)`` (empty arrays for empty input).
    """
    count = len(data)
    samples = np.empty(count, dtype=object)
    scores = np.empty(count, dtype=object)
    labels = np.empty(count, dtype=object)
    for i, record in enumerate(data):
        # Element-wise assignment stores each object as-is, without NumPy
        # trying to interpret tensors or lists as nested array data.
        samples[i], scores[i], labels[i] = record[0], record[1], record[2]
    return samples, scores, labels
def give_numeric_labels(labels):
    """Overwrite each label in place with a one-element class-id tensor.

    A label equal to "H" becomes ``tensor([0])``; every other value becomes
    ``tensor([1])``. Nothing is returned.

    Args:
        labels: Mutable, index-assignable sequence of labels.
    """
    for position, label in enumerate(labels):
        class_id = 0 if label == "H" else 1
        labels[position] = torch.tensor([class_id])
def output_train_ints(train_file=True):
    """Load a dataset and return it with texts encoded as integer ids.

    Args:
        train_file: Read ``TRAIN_FILE`` when true, ``TEST_FILE`` otherwise.

    Returns:
        A tuple ``(sample, score, labels)`` as produced by
        ``split_data_labels``, with the labels converted by
        ``give_numeric_labels``.
    """
    source_path = TRAIN_FILE if train_file else TEST_FILE
    encoded = translate_to_integer(split(source_path))
    sample, score, labels = split_data_labels(encoded)
    # Labels are converted in place; the call itself returns nothing.
    give_numeric_labels(labels)
    return sample, score, labels
def get_label_from_output(output):
    """Return "H" when the argmax of ``output[0]`` is index 0, else "M"."""
    predicted_class = torch.argmax(output[0]).item()
    if predicted_class == 0:
        return "H"
    return "M"
def output_train_words(train_file=True):
    """Load a dataset and return it with texts left as raw strings.

    Args:
        train_file: Read ``TRAIN_FILE`` when true, ``TEST_FILE`` otherwise.

    Returns:
        A tuple ``(samples, scores, labels)`` as produced by
        ``split_data_labels``, with the labels converted by
        ``give_numeric_labels``.
    """
    source_path = TRAIN_FILE if train_file else TEST_FILE
    records = split(source_path)
    samples, scores, labels = split_data_labels(records)
    # Labels are converted in place; the call itself returns nothing.
    give_numeric_labels(labels)
    return samples, scores, labels