-
Notifications
You must be signed in to change notification settings - Fork 0
/
DataSet.py
125 lines (103 loc) · 5.25 KB
/
DataSet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import pandas as pd
import csv
import re
import os
import numpy as np
from utils import utils
import random
from sklearn.model_selection import train_test_split
from Preprocess import Preprocess
import matplotlib.pyplot as plt
LABELS = ['agree', 'disagree', 'discuss', 'unrelated']


class DataSet():
    '''
    Load the stance CSV files and expose train / validation / test sets.

    Stances are joined with their article bodies on ``body_id``; the training
    data is then split 90/10 (stratified by stance) into a training and a
    validation set. The three sets are stored as NumPy arrays whose columns
    are ``headline``, ``article`` and ``stance``.
    '''

    def __init__(self, preprocess=False):
        '''
        Read the article data from the ``data`` folder under the current
        working directory and build the train / validation / test sets.
        :param preprocess: (Boolean) True, to clean data set. Default=False
        '''
        self.__preprocess = preprocess  # if True, clean the data sets after loading
        self.__data_folder = os.path.join(os.getcwd(), 'data')
        self.__train_bodies_csv = os.path.join(self.__data_folder, 'train_bodies.csv')
        self.__train_stance_csv = os.path.join(self.__data_folder, 'train_stances.csv')
        self.__test_bodies_csv = os.path.join(self.__data_folder, 'test_bodies.csv')
        self.__test_stances_csv = os.path.join(self.__data_folder, 'competition_test_stances.csv')
        # Replaced by the training stances DataFrame inside __reader().
        self.__all = []
        self.__train_all, self.__val_all, self.__test_all = self.__reader()

    @staticmethod
    def _load_split(bodies_csv, stances_csv):
        '''
        Read one bodies/stances CSV pair and left-join the bodies onto the stances.
        :param bodies_csv: path of the comma-separated file with the body id and article text
        :param stances_csv: path of the comma-separated file with headline, body id and stance
        :return: (stances_df, merged_df); stances_df has the columns
                 headline, body_id, stance, target and merged_df additionally
                 carries the matching article column
        '''
        # Pass the separator exactly once: pandas raises ValueError when both
        # ``sep`` and ``delimiter`` are supplied to read_csv.
        bodies_df = pd.read_csv(bodies_csv, sep=',', header=0,
                                names=['body_id', 'article'])
        stances_df = pd.read_csv(stances_csv, sep=',', header=0,
                                 names=['headline', 'body_id', 'stance'])

        # Numeric target = position of the stance in LABELS; -1 for anything else.
        stances_df['target'] = -1
        for index, label in enumerate(LABELS):
            stances_df.loc[stances_df['stance'] == label, 'target'] = index

        merged_df = pd.merge(stances_df, bodies_df, on='body_id', how='left')
        return stances_df, merged_df

    @staticmethod
    def _print_stance_counts(title, frame):
        '''
        Print one row of the stance summary table.
        :param title: text printed in the first column (including its trailing tab)
        :param frame: DataFrame with a ``stance`` column
        '''
        unrelated, discuss, agree, disagree = (
            int((frame['stance'] == stance).sum())
            for stance in ('unrelated', 'discuss', 'agree', 'disagree')
        )
        print(title, unrelated, '\t', discuss, '\t', agree, '\t', disagree)

    def __reader(self):
        '''
        Load, optionally clean, and split the data.
        :return: (train_all, val_all, test_all) NumPy arrays with the columns
                 headline, article, stance
        '''
        train_stances_df, train_df = self._load_split(self.__train_bodies_csv,
                                                      self.__train_stance_csv)
        # Kept so plot_distribution() can count the training stances later.
        self.__all = train_stances_df
        _, test_df = self._load_split(self.__test_bodies_csv, self.__test_stances_csv)

        # clean two data sets
        if self.__preprocess:
            print('Cleaning training set...\n')
            train_df = Preprocess(train_df).preprocess(['headline', 'article'])
            print('\nCleaning testing set...\n')
            test_df = Preprocess(test_df).preprocess(['headline', 'article'])

        # re-order column index, and drop the remaining columns
        kept_columns = ['headline', 'article', 'stance']
        train_df = train_df[kept_columns]
        test_df = test_df[kept_columns]

        # stratify on the stance so both parts keep the label proportions
        train_all, val_all = train_test_split(train_df, train_size=0.9, random_state=0,
                                              stratify=train_df['stance'])

        print('\n\tUnrltd\tDiscuss\t Agree\tDisagree')
        self._print_stance_counts('All\t', train_df)
        self._print_stance_counts('Train\t', train_all)
        self._print_stance_counts('Valid.\t', val_all)

        return np.array(train_all), np.array(val_all), np.array(test_df)

    def get_train(self):
        '''Return the training set array.'''
        return self.__train_all

    def get_validation(self):
        '''Return the validation set array.'''
        return self.__val_all

    def get_test(self):
        '''Return the test set array.'''
        return self.__test_all

    def plot_distribution(self):
        '''Show a bar chart of how often each stance occurs in the training stances.'''
        label_counts = self.__all['stance'].value_counts()
        label_counts.plot(kind='bar')
        plt.title('label distribution')
        plt.xlabel('class')
        plt.ylabel('counts')
        plt.grid(axis='y', alpha=0.75)
        plt.show()
# Running the module as a script simply builds the data set (loading and
# printing the split summary) without any cleaning.
if __name__ == "__main__":
    DataSet()