-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_process.py
executable file
·36 lines (29 loc) · 1.2 KB
/
data_process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
"""
it's used to create a class which is the subclass of Dataset, it serves as the argument of dataloader
attention that in different system, default encoding of files is different, it's matter to specify encoding
"""
import pickle
import numpy
from nltk import word_tokenize
from torch.utils.data import Dataset
class CorpusData(Dataset):
    """Map-style Dataset over a line-per-sentence text corpus.

    Each item is a ``(1, max_len)`` numpy array of word ids, zero-padded on
    the right, together with the number of tokens actually stored.

    Parameters (all keyword-compatible with the previous hard-coded values):
        corpus_path: path of the utf-8 corpus file, one sentence per line.
        dic_path:    path of a pickled ``word -> id`` dictionary.
        max_len:     maximum number of tokens kept per sentence (default 30).
        start, end:  strings prepended/appended to each raw line (default
                     empty, matching the original behavior).
        tokenizer:   callable ``str -> list[str]``; defaults to
                     ``nltk.word_tokenize`` when None.
    """

    def __init__(self, corpus_path='data/corpus_10w_train.txt',
                 dic_path='data/id_dic_10w.pkl', max_len=30,
                 start='', end='', tokenizer=None):
        # Explicit utf-8: the platform default encoding varies, and the
        # corpus is known to be utf-8 encoded.
        with open(corpus_path, 'r', encoding='utf-8') as file:
            self.text = [start + line.strip() + end for line in file]
        # SECURITY NOTE(review): pickle.load executes arbitrary code from
        # the file — only load project-generated, trusted dictionaries here.
        with open(dic_path, 'rb') as file:
            self.id_dic = pickle.load(file)
        self.max_len = max_len
        # Bind the default tokenizer lazily so importing this module does
        # not require nltk to be resolvable at class-definition time.
        self.tokenizer = tokenizer if tokenizer is not None else word_tokenize

    def __len__(self):
        """Return the number of sentences in the corpus."""
        return len(self.text)

    def __getitem__(self, index):
        """Return ``(ids, count)`` for the sentence at *index*.

        ``ids`` is a ``(1, max_len)`` float array of word ids, zero-padded;
        ``count`` is ``min(number of tokens, max_len)``.

        Raises:
            KeyError: if a token is missing from the id dictionary
                (unchanged from the original behavior).
        """
        tokens = self.tokenizer(self.text[index])
        inputs = numpy.zeros((1, self.max_len))
        # Keep at most max_len tokens; enumerate replaces the manual counter.
        for pos, word in enumerate(tokens[:self.max_len]):
            inputs[0, pos] = self.id_dic[word]
        return inputs, min(len(tokens), self.max_len)