-
Notifications
You must be signed in to change notification settings - Fork 1
/
parse.py
60 lines (47 loc) · 1.46 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import numpy
import math
from numpy import *
def readCsv(filename):
    """Read a comma-separated file of integers into a numpy matrix.

    Each non-blank line becomes one row; every field is converted with
    ``int``. Blank lines are reported with a printed message and skipped.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A ``numpy.matrix`` with one row per non-blank line.

    Raises:
        NameError: If a line has a different number of fields than the
            first non-blank line.
        ValueError: If a field cannot be parsed as an integer.
    """
    data = []
    featureCount = -1
    # The context manager closes the file even when a row fails to parse.
    with open(filename) as csvFile:
        for line in csvFile:
            stripped = line.strip()
            # Detect blank lines before converting: ''.split(',') yields
            # [''], so int('') would raise instead of skipping the line.
            if not stripped:
                print("Empty line")
                continue
            # Build a real list so len() works on Python 3 as well as 2.
            items = [int(field) for field in stripped.split(',')]
            if featureCount == -1:
                featureCount = len(items)
            if featureCount != len(items):
                raise NameError('inconsistent feature counts! expected ' + str(featureCount) + ' was ' + str(len(items)))
            data.append(items)
    # asmatrix is the supported spelling of the older `mat` alias.
    return numpy.asmatrix(data)
def split(data, testRatio=0.2):
    """Split rows of ``data`` into a leading and a trailing part.

    The split point is ``int(rows * (1 - testRatio)) - 1``, so the first
    part holds one row fewer than the truncated train fraction.

    Args:
        data: Array-like whose first axis indexes rows.
        testRatio: Fraction of rows intended for the second (test) part.

    Returns:
        The two-element list produced by ``numpy.split``: the rows before
        the split point, then the rows from the split point onward.
    """
    rowCount = numpy.shape(data)[0]
    splitIndex = int(rowCount * (1 - testRatio)) - 1
    # A negative index would be interpreted by numpy as an offset from the
    # end (e.g. testRatio=1.0 would put all but one row in the first part),
    # so clamp it: "no training rows" is the intended meaning.
    splitIndex = max(splitIndex, 0)
    return numpy.split(data, [splitIndex], axis=0)
def splitLabels(data):
    """Separate the last column of ``data`` from the columns before it.

    Args:
        data: Two-dimensional array supporting ``data[:, a:b]`` slicing.

    Returns:
        A tuple ``(features, labels)``: every column except the last, and
        the last column kept as a single-column two-dimensional slice.
    """
    features = data[:, :-1]
    labels = data[:, -1:]
    return features, labels
def shuffle(data):
    """Shuffle ``data`` in place and hand the same object back.

    Args:
        data: Sequence or array accepted by ``numpy.random.shuffle``.

    Returns:
        The ``data`` object that was passed in, after shuffling.
    """
    shuffleInPlace = numpy.random.shuffle
    shuffleInPlace(data)
    return data
def grab(data, count):
    """Return the first ``count`` rows of a shuffled copy of ``data``.

    The input is copied before shuffling, so ``data`` itself is not
    reordered.

    Args:
        data: Array providing a ``copy()`` method.
        count: Index along axis 0 at which the shuffled copy is cut.

    Returns:
        The leading piece of the shuffled copy, as produced by
        ``numpy.split`` at ``count``.
    """
    scrambled = data.copy()
    numpy.random.shuffle(scrambled)
    pieces = numpy.split(scrambled, [count], axis=0)
    head = pieces[0]
    return head
def nominalToVector(y, cats):
    """Expand a column of category indices into indicator rows.

    Args:
        y: Two-dimensional array with exactly one column; entry ``(i, 0)``
            is used as a column index into row ``i`` of the result.
        cats: Number of columns in the result.

    Returns:
        An array of zeros with shape ``(rows of y, cats)`` in which, for
        each row ``i``, the entry at column ``y[i, 0]`` is set to 1.

    Raises:
        NameError: If ``y`` does not have exactly one column.
    """
    dims = numpy.shape(y)
    if dims[1] != 1:
        # dims is a tuple; it must be converted before concatenation or the
        # message itself raises TypeError and hides the real problem.
        raise NameError('Unexpected shape ' + str(dims))
    result = numpy.zeros((dims[0], cats))
    for i in range(dims[0]):
        val = y.item((i, 0))
        result[i, val] = 1
    return result
def parse(filename, cats):
    """Load a CSV file, shuffle its rows and split it into train/test sets.

    Args:
        filename: Path of the CSV file to load.
        cats: Accepted for interface compatibility; not used here.

    Returns:
        A tuple ``(trainX, trainY, testX, testY)`` where the ``X`` parts
        hold every column except the last and the ``Y`` parts hold the
        last column.
    """
    # Read the file the caller asked for rather than a hard-coded path.
    data = readCsv(filename)
    data = shuffle(data)
    train, test = split(data)
    trainX, trainY = splitLabels(train)
    testX, testY = splitLabels(test)
    return trainX, trainY, testX, testY