# dataset.py
import os
import pickle
import re
import string
from collections import Counter

import numpy as np
import pandas as pd
from gensim.models.keyedvectors import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from spacy.lang.pl import Polish

from config import Parser
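
# Matches characters above U+FFFF (the supplementary planes), which covers
# most emoji; used below to strip them from tokens during preprocessing.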
RE_EMOJI = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)


class Dataset:
    def __init__(self, texts_file, tags_file, clean_data=True, remove_stopwords=False, is_train=True):
        self.args = Parser().get_sections(['GENERAL', 'RNN', 'FLAIR'])
        self.max_sent_length = int(self.args['max_sent_length'])
        self.batch_size = int(self.args['batch_size'])
        self.emb_size = int(self.args['emb_size'])
        self.clean_data = clean_data
        self.remove_stopwords = remove_stopwords
        self.is_train = is_train
        self.nlp = Polish()
        self.df = self.build_dataframe(texts_file, tags_file)
        self.unk_emb = self.get_random_emb(self.emb_size)
        self.word2idx, self.idx2word = self.build_dict()
        if self.is_train:
            self.embeddings = self.get_embeddings(self.args['emb_path'])
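
    # Reads one tweet per line plus the matching tag file into a DataFrame with
    # text, tag, tokens, length and clean_text columns; when clean_data is set,
    # duplicates and retweet/one-token rows are dropped via clean().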
    def build_dataframe(self, texts_file, tags_file):
        with open(texts_file) as file:
            lines = [line.strip() for line in file.readlines()]
        texts = pd.DataFrame(lines, columns=['text'])
        tags = pd.read_fwf(tags_file, header=None, names=['tag'])
        df = pd.concat([texts, tags], axis=1)
        df['tokens'] = df['text'].map(lambda x: self.preprocess_sentence(x))
        df['length'] = df['tokens'].map(lambda x: len(x))
        df['clean_text'] = df['tokens'].map(lambda x: " ".join(x))
        if self.clean_data:
            df = self.clean(df)
        return df
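
    # Tokenises a single tweet with spaCy's Polish tokenizer, drops a leading
    # "RT" token (only when clean_data is off) and @anonymized_account mentions
    # at both ends, optionally removes stop words, lowercases, and strips
    # digits, punctuation and emoji.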
    def preprocess_sentence(self, sentence):
        sentence = sentence.replace(r"\n", "").replace(r"\r", "").replace(r"\t", "").replace("„", "").replace("”", "")
        doc = [tok for tok in self.nlp(sentence)]
        if not self.clean_data and str(doc[0]) == "RT":
            doc.pop(0)
        while str(doc[0]) == "@anonymized_account":
            doc.pop(0)
        while str(doc[-1]) == "@anonymized_account":
            doc.pop()
        if self.remove_stopwords:
            doc = [tok for tok in doc if not tok.is_stop]
        doc = [tok.lower_ for tok in doc]
        doc = ["".join(c for c in tok if not c.isdigit() and c not in string.punctuation) for tok in doc]
        doc = [RE_EMOJI.sub(r'', tok) for tok in doc]
        doc = [tok.strip() for tok in doc if tok.strip()]
        return doc
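
    # In training mode the vocabulary is built from the corpus (indices 0 and 1
    # are reserved for the pad and unk tokens from the config) and pickled to
    # word_dict_path; at inference time the pickled mapping is loaded instead.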
    def build_dict(self):
        if self.is_train:
            sentences = self.df['tokens']
            all_tokens = [token for sentence in sentences for token in sentence]
            words_counter = Counter(all_tokens).most_common()
            word2idx = {
                self.args['pad']: 0,
                self.args['unk']: 1
            }
            for word, _ in words_counter:
                word2idx[word] = len(word2idx)
            with open(self.args['word_dict_path'], 'wb') as dict_file:
                pickle.dump(word2idx, dict_file)
        else:
            with open(self.args['word_dict_path'], 'rb') as dict_file:
                word2idx = pickle.load(dict_file)
        idx2word = {idx: word for word, idx in word2idx.items()}
        return word2idx, idx2word
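
    # Truncates/pads every token list to max_sent_length and maps tokens to
    # vocabulary indices, falling back to the unk index for unknown words.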
    def transform_dataset(self):
        sentences = self.df['tokens'].values
        x = [sentence[:self.max_sent_length] for sentence in sentences]
        x = [sentence + [self.args['pad']] * (self.max_sent_length - len(sentence)) for sentence in x]
        x = [[self.word2idx.get(word, self.word2idx[self.args['unk']]) for word in sentence] for sentence in x]
        y = self.df['tag'].values
        return np.array(x), np.array(y)
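
    # In training mode: shuffle, hold out a stratified 15% validation split and
    # return the training part as mini-batches plus the validation arrays;
    # otherwise just return the whole set as mini-batches.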
    def parse_dataset(self):
        x, y = self.transform_dataset()
        if self.is_train:
            x, y = shuffle(x, y, random_state=42)
            train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size=0.15, random_state=42, stratify=y)
            return list(self.chunks(train_x, train_y, self.batch_size)), valid_x, valid_y
        return list(self.chunks(x, y, self.batch_size))
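
    # Builds the embedding matrix row-aligned with the word indices: zeros for
    # the pad token, the fixed random unk vector for the unk token and any
    # out-of-vocabulary word, and word2vec vectors from emb_path otherwise.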
    def get_embeddings(self, embeddings_file):
        emb_list = []
        print("Loading vectors...")
        word_vectors = KeyedVectors.load_word2vec_format(embeddings_file, binary=False)
        print("Vectors loaded...")
        for _, word in sorted(self.idx2word.items()):
            if word == self.args['pad']:
                word_vec = np.zeros(self.emb_size)
            elif word == self.args['unk']:
                word_vec = self.unk_emb
            else:
                try:
                    word_vec = word_vectors.word_vec(word)
                except KeyError:
                    word_vec = self.unk_emb
            emb_list.append(word_vec)
        return np.array(emb_list, dtype=np.float32)
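
    # Inverse-frequency class weights: weight_c = 1 - count_c / N, so the rarer
    # class receives the larger weight.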
    def get_class_weight(self):
        y = self.df['tag'].values
        _, counts = np.unique(y, return_counts=True)
        return np.array(1 - counts / y.size)
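
    # Writes train/dev/test files under flair_data_path in the FastText-style
    # "__label__<tag> <text>" format used by Flair classification corpora,
    # using two stratified 15% splits of the chosen text column.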
    def prepare_flair_format(self, column_name):
        X = self.df[column_name].values
        y = self.df['tag'].values
        train_x, val_x, train_y, val_y = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
        train_x, dev_x, train_y, dev_y = train_test_split(train_x, train_y, test_size=0.15, random_state=42,
                                                          stratify=train_y)
        train_data = [f"__label__{label} {text}" for text, label in zip(train_x, train_y)]
        val_data = [f"__label__{label} {text}" for text, label in zip(val_x, val_y)]
        dev_data = [f"__label__{label} {text}" for text, label in zip(dev_x, dev_y)]
        flair_dir_path = self.args['flair_data_path']
        os.makedirs(flair_dir_path, exist_ok=True)
        with open(os.path.join(flair_dir_path, f"train_{column_name}.txt"), 'w') as train_file, \
                open(os.path.join(flair_dir_path, f"dev_{column_name}.txt"), 'w') as dev_file, \
                open(os.path.join(flair_dir_path, f"test_{column_name}.txt"), 'w') as val_file:
            train_file.write("\n".join(train_data))
            dev_file.write("\n".join(dev_data))
            val_file.write("\n".join(val_data))

    def print_stats(self):
        print(self.df['length'].describe())
        print(self.df['length'].quantile(0.95, interpolation='lower'))
        print(self.df['length'].quantile(0.99, interpolation='lower'))
        print(self.df.shape)
        print(self.df['tag'].value_counts())

    @staticmethod
    def get_random_emb(length):
        return np.random.uniform(-0.25, 0.25, length)

    @staticmethod
    def clean(dataframe):
        dataframe = dataframe.drop_duplicates('clean_text')
        return dataframe[(dataframe['tokens'].apply(lambda x: "rt" not in x[:1])) & (dataframe['length'] > 1)]

    @staticmethod
    def chunks(inputs, outputs, batch_size):
        for i in range(0, len(inputs), batch_size):
            yield inputs[i:i + batch_size], outputs[i:i + batch_size]


if __name__ == "__main__":
    dt = Dataset("../data/task_6-2/training_set_clean_only_text.txt",
                 "../data/task_6-2/training_set_clean_only_tags.txt",
                 clean_data=True, remove_stopwords=False, is_train=False)
    # dt.prepare_flair_format("clean_text")
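
    # Rough sketch of downstream usage (assumes the paths configured for
    # Parser, e.g. word_dict_path, already exist). With is_train=False as
    # above, parse_dataset() returns only a list of (inputs, tags) batches:
    # batches = dt.parse_dataset()
    # dt.print_stats()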