forked from zygmuntz/pylearn2-practice
-
Notifications
You must be signed in to change notification settings - Fork 2
/
adult_dataset.py
95 lines (75 loc) · 2.32 KB
/
adult_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
"""
adult dataset wrapper for pylearn2
"""
import csv
import numpy as np
import os
from pylearn2.datasets.dense_design_matrix import DenseDesignMatrix
from pylearn2.utils import serial
from pylearn2.utils.string_utils import preprocess
class AdultDataset(DenseDesignMatrix):
    """Adult (census income) dataset loaded from a CSV file of integers.

    When labels are present, the first CSV column is the binary class
    label and the remaining columns are the features. Labels are always
    exposed as a two-column one-hot ``float32`` matrix.
    """

    def __init__(self,
                 path='train.csv',
                 one_hot=False,
                 with_labels=True,
                 start=None,
                 stop=None,
                 preprocessor=None,
                 fit_preprocessor=False,
                 fit_test_preprocessor=False):
        """
        path: Path to the .csv file to load. It is passed through
            pylearn2's `preprocess` before use and must end in '.csv'.
        one_hot: Accepted for interface compatibility but currently
            ignored; labels are always one-hot encoded.
        with_labels: True if the first column of the file holds the
            class label; False if every column is a feature (y is None).
        start: Optional index of the first row to keep. When given,
            `stop` must be given as well.
        stop: Index one past the last row to keep (used only when
            `start` is not None).
        preprocessor: Optional object whose `apply(dataset, can_fit=...)`
            method is called on this dataset after construction.
        fit_preprocessor: True if the preprocessor is allowed to fit the
            data.
        fit_test_preprocessor: If we construct a test set based on this
            dataset, should it be allowed to fit the test set?
        """
        self.no_classes = 2

        # Arguments recorded for building a matching test set. The dict is
        # spelled out explicitly instead of being carved out of locals(),
        # so it cannot silently pick up unrelated local variables.
        # NOTE: 'which_set' is not a parameter of this constructor; the key
        # is kept only so existing readers of `test_args` see the same
        # content as before.
        self.test_args = {
            'path': path,
            'one_hot': one_hot,
            'with_labels': with_labels,
            'preprocessor': preprocessor,
            'fit_preprocessor': fit_test_preprocessor,
            'fit_test_preprocessor': fit_test_preprocessor,
            'which_set': 'test',
        }

        path = preprocess(path)
        X, y = self._load_data(path, with_labels)

        if start is not None:
            # No check against an undefined `which_set` name here: it used
            # to raise NameError whenever a row range was requested.
            assert isinstance(start, int)
            assert isinstance(stop, int)
            assert start >= 0
            assert start < stop
            assert stop <= X.shape[0]
            X = X[start:stop, :]
            if y is not None:
                y = y[start:stop, :]

        super(AdultDataset, self).__init__(X=X, y=y)

        if preprocessor:
            preprocessor.apply(self, can_fit=fit_preprocessor)

    def _load_data(self, path, expect_labels):
        """Read the CSV at `path` and return a `(X, y)` pair.

        path: Path to a comma-separated file of integers; must end in
            '.csv'.
        expect_labels: If True, column 0 is the label and is converted to
            a one-hot float32 matrix of shape (rows, self.no_classes):
            a label equal to 1 sets column 1, any other value sets
            column 0. If False, all columns are features and y is None.
        """
        assert path.endswith('.csv')

        # ndmin=2 keeps a single-row file two-dimensional so the column
        # slicing below does not fail on a 1-D array.
        data = np.loadtxt(path, delimiter=',', dtype='int', ndmin=2)

        if not expect_labels:
            return data, None

        labels = data[:, 0]
        X = data[:, 1:]

        # TODO: honour the constructor's `one_hot` flag; for now the
        # labels are always one-hot encoded.
        y = np.zeros((labels.shape[0], self.no_classes), dtype='float32')
        is_positive = labels == 1
        y[is_positive, 1] = 1.
        y[~is_positive, 0] = 1.

        return X, y