-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdata.py
166 lines (132 loc) · 6.66 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import csv
import os
import numpy as np
import torch
from torch.utils.data import Dataset
class Dictionary(object):
    """Bidirectional word/index vocabulary that also counts occurrences.

    Attributes are kept as three parallel structures:
    ``word2idx`` maps a word to its index, while ``idx2word`` and
    ``idx2count`` are lists indexed by that same index.
    """

    def __init__(self):
        self.word2idx = {}
        self.idx2word = []
        self.idx2count = []

    def add_word(self, word):
        """Register one occurrence of ``word`` and return its index.

        A word seen for the first time is appended with a count of one;
        a known word only has its count incremented.
        """
        index = self.word2idx.get(word)
        if index is None:
            # first sighting: the new index is the next free slot
            self.idx2word.append(word)
            self.idx2count.append(1)
            index = len(self.idx2word) - 1
            self.word2idx[word] = index
        else:
            self.idx2count[index] += 1
        return index

    def __len__(self):
        """Return the number of distinct words."""
        return len(self.idx2word)
class Corpus(object):
    """Vocabularies and token ids for the ICLR accepted/rejected paper lists.

    Construction reads four text files, builds one ``Dictionary`` for title
    words and one for author names, re-indexes both by descending frequency,
    and finally converts every file into lists of indices.

    Each row of a file has: (paper name) + (tab) + (authors delimited by ", ").

    Attributes:
        dictionary_title: vocabulary of lowercased title words.
        dictionary_authors: vocabulary of lowercased author names.
        train_accepted, train_rejected, test_accepted, test_rejected:
            ``[ids_title, ids_authors]`` as returned by ``tokenize``.
    """

    def __init__(self, path_train, path_test):
        """Build the vocabularies and tokenize the train/test files.

        Args:
            path_train: directory containing the ICLR 2017 lists.
            path_test: directory containing the ICLR 2018 lists.
        """
        self.dictionary_title = Dictionary()
        self.dictionary_authors = Dictionary()

        # training set is the ICLR 2017 list.
        # you can add as many extra training files as you want!
        train_accepted = os.path.join(path_train, 'ICLR_2017_accepted.txt')
        train_rejected = os.path.join(path_train, 'ICLR_2017_rejected.txt')
        # test set is the ICLR 2018 list. DO NOT MODIFY!
        test_accepted = os.path.join(path_test, 'ICLR_2018_accepted.txt')
        test_rejected = os.path.join(path_test, 'ICLR_2018_rejected.txt')

        # The vocabulary must cover every file before any of them is
        # tokenized, because the indices change when it is re-sorted below.
        for path in (train_accepted, train_rejected, test_accepted, test_rejected):
            self.add_corpus(path)

        # sort the words by word frequency in descending order
        self._sort_by_frequency(self.dictionary_title)
        self._sort_by_frequency(self.dictionary_authors)

        self.train_accepted = self.tokenize(train_accepted)
        self.train_rejected = self.tokenize(train_rejected)
        self.test_accepted = self.tokenize(test_accepted)
        self.test_rejected = self.tokenize(test_rejected)

    @staticmethod
    def _sort_by_frequency(dictionary):
        """Re-index ``dictionary`` in place so frequent words get small indices."""
        # argsort is ascending, so flipping it yields descending counts.
        order = np.flip(np.argsort(dictionary.idx2count), axis=-1).tolist()
        dictionary.idx2count = [dictionary.idx2count[i] for i in order]
        dictionary.idx2word = [dictionary.idx2word[i] for i in order]
        dictionary.word2idx = {
            word: index for index, word in enumerate(dictionary.idx2word)
        }

    @staticmethod
    def _parse_row(row):
        """Split one file row into ``(title_words, author_names)``.

        Returns ``None`` for a blank line so callers can skip it. A non-blank
        row without a tab raises ``IndexError``.
        """
        fields = row.split('\t')
        if len(fields) == 1 and not row.strip():
            return None
        # lowercase the string and split the title by whitespace
        title = fields[0].lower().strip('\n').split()
        # ICLR 2017 authors has a dirty \xa0 instead of space. replace it
        authors = fields[1].replace(u'\xa0', u' ')
        # authors are delimited by ", "
        authors = authors.lower().strip('\n').split(', ')
        return title, authors

    def add_corpus(self, path):
        """Add every title word and author name in ``path`` to the dictionaries.

        Args:
            path: text file with one "title<TAB>authors" row per paper.

        Raises:
            FileNotFoundError: if ``path`` does not exist.
        """
        # An explicit raise instead of ``assert``: asserts vanish under -O.
        if not os.path.exists(path):
            raise FileNotFoundError('corpus file not found: {}'.format(path))
        with open(path, 'r', encoding="utf8") as f:
            for row in f:
                parsed = self._parse_row(row)
                if parsed is None:
                    continue
                title, authors = parsed
                for word in title:
                    self.dictionary_title.add_word(word)
                for word in authors:
                    self.dictionary_authors.add_word(word)

    def tokenize(self, path):
        """Convert the rows of ``path`` into vocabulary indices.

        Args:
            path: text file with one "title<TAB>authors" row per paper.

        Returns:
            ``[ids_title, ids_authors]``: two lists with one entry per
            non-blank row, each entry being the list of indices for that
            row's title words / author names.

        Raises:
            KeyError: if a word or author is missing from the dictionaries,
                i.e. the file was not passed to ``add_corpus`` beforehand.
        """
        title_index = self.dictionary_title.word2idx
        authors_index = self.dictionary_authors.word2idx
        ids_title = []
        ids_authors = []
        with open(path, 'r', encoding="utf8") as f:
            for row in f:
                parsed = self._parse_row(row)
                if parsed is None:
                    continue
                title, authors = parsed
                ids_title.append([title_index[word] for word in title])
                ids_authors.append([authors_index[word] for word in authors])
        return [ids_title, ids_authors]
class PaperDecisionDataset(Dataset):
    """Dataset pairing tokenized titles and authors with a decision label.

    The three sequences are stored as given and indexed in lockstep.
    """

    def __init__(self, x_title, x_authors, y):
        """Store the three parallel sequences and check they line up."""
        self.x_title, self.x_authors, self.y = x_title, x_authors, y
        n_title = len(self.x_title)
        n_authors = len(self.x_authors)
        # all three sequences must describe the same number of papers
        assert n_title == n_authors and n_authors == len(self.y)

    def __len__(self):
        """Return the number of papers."""
        return len(self.x_title)

    def __getitem__(self, index):
        """Return ``(title, authors, label)`` for the paper at ``index``."""
        sample = (self.x_title[index], self.x_authors[index], self.y[index])
        return sample
def _merge_with_labels(accepted, rejected):
    """Concatenate an accepted and a rejected split and build their labels.

    Both arguments are ``[ids_title, ids_authors]`` pairs. Returns
    ``(titles, authors, labels)`` with accepted papers first.
    """
    titles = accepted[0] + rejected[0]
    authors = accepted[1] + rejected[1]
    # accepted papers get label zero, rejected papers get label one.
    # Plain int lists are used instead of np.zeros/np.ones(..., dtype=np.long):
    # np.long is missing from some NumPy releases and the result was
    # converted to a Python list anyway.
    labels = [0] * len(accepted[0]) + [1] * len(rejected[0])
    return titles, authors, labels


def create_corpus(path_train, path_test):
    """Build the corpus and the labelled train/test splits.

    Args:
        path_train: directory containing the training lists.
        path_test: directory containing the test lists.

    Returns:
        A 7-tuple ``(corpus, x_title_train, x_authors_train, y_train,
        x_title_test, x_authors_test, y_test)``. Within each split the
        accepted papers (label 0) come first, followed by the rejected
        papers (label 1).
    """
    corpus = Corpus(path_train, path_test)
    x_title_train, x_authors_train, y_train = _merge_with_labels(
        corpus.train_accepted, corpus.train_rejected)
    x_title_test, x_authors_test, y_test = _merge_with_labels(
        corpus.test_accepted, corpus.test_rejected)
    return corpus, x_title_train, x_authors_train, y_train, x_title_test, x_authors_test, y_test
def collate_fn(data):
    """Custom collate function for PaperDecisionDataset.

    Turns an iterable of ``(title, authors, decision)`` samples into three
    parallel lists, leaving every element untouched (no padding or tensor
    conversion happens here).

    Returns:
        A tuple ``(title, authors, decision)`` of lists.
    """
    columns = ([], [], [])
    for sample in data:
        # fields are read by position 0, 1, 2, one sample at a time
        for position, column in enumerate(columns):
            column.append(sample[position])
    title, authors, decision = columns
    return title, authors, decision