# read.py
"""
Description: Read the initial dataset and decode it into a list. Here we replace all missing values and
discretize the numerical values.
Input: initial dataset stored in *.data file, and scheme description stored in *.names file.
Output: a data list after pre-processing.
Author: CBA Studio
"""
import csv
def read_data(path):
    """Read a comma-separated *.data file into a list of rows.

    Args:
        path: Path of the *.data file.

    Returns:
        A list of rows, each row being the list of string fields of one
        line. Blank lines, which csv.reader yields as empty lists, are
        dropped.
    """
    # newline='' leaves newline handling to the csv module, as the csv
    # documentation requires for file objects passed to csv.reader.
    with open(path, 'r', newline='') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')
        # Filter while reading rather than repeatedly scanning the finished
        # list with `[] in data` / `data.remove([])`, which is quadratic.
        return [row for row in reader if row]
def read_scheme(path):
    """Read attribute names and value types from a *.names scheme file.

    The first non-blank row is taken as the attribute names and the second
    non-blank row as the value type of each attribute. Blank lines are
    skipped, matching how read_data treats them.

    Args:
        path: Path of the *.names file.

    Returns:
        A tuple (attributes, value_type) of two lists of strings.

    Raises:
        ValueError: If the file contains fewer than two non-blank rows.
    """
    # newline='' leaves newline handling to the csv module, as the csv
    # documentation requires for file objects passed to csv.reader.
    with open(path, 'r', newline='') as csv_file:
        rows = (row for row in csv.reader(csv_file, delimiter=',') if row)
        try:
            attributes = next(rows)
            value_type = next(rows)
        except StopIteration:
            # Report a malformed scheme file explicitly instead of leaking
            # a bare StopIteration out of this function.
            raise ValueError(
                'scheme file %r must contain an attribute row and a '
                'value-type row' % (path,)
            ) from None
    return attributes, value_type
def str2numerical(data, value_type):
    """Convert the string fields of numerical columns to float, in place.

    The number of columns is taken from the first row. The last column is
    never converted, and fields equal to '?' are left untouched.

    Args:
        data: Data list returned by read_data (a list of rows of strings).
        value_type: List returned by read_scheme; a column is converted
            when its entry equals 'numerical'.

    Returns:
        The same list object as data, with its rows modified in place.
    """
    # Nothing to convert; also avoids indexing data[0] on an empty dataset.
    if not data:
        return data
    # The last column is excluded from conversion.
    feature_count = len(data[0]) - 1
    # Decide once which columns are numerical instead of once per row.
    numerical_columns = [
        col for col in range(feature_count) if value_type[col] == 'numerical'
    ]
    for row in data:
        for col in numerical_columns:
            if row[col] != '?':
                row[col] = float(row[col])
    return data
def read(data_path, scheme_path):
    """Load a dataset together with its scheme.

    Args:
        data_path: Location of the *.data file.
        scheme_path: Location of the *.names file.

    Returns:
        A tuple (data, attributes, value_type): the data rows after
        str2numerical has been applied, followed by the two lists
        returned by read_scheme.
    """
    raw_rows = read_data(data_path)
    scheme = read_scheme(scheme_path)
    attributes, value_type = scheme
    return str2numerical(raw_rows, value_type), attributes, value_type
# just for test
if __name__ == '__main__':
    import sys

    import pre_processing

    # Optional command-line overrides: read.py [data_path [scheme_path]].
    # Without arguments the original hard-coded iris paths are used.
    cli_args = sys.argv[1:]
    test_data_path = cli_args[0] if len(cli_args) > 0 else '/Users/liulizhi/Desktop/iris.data'
    test_scheme_path = cli_args[1] if len(cli_args) > 1 else '/Users/liulizhi/Desktop/iris.names'

    test_data, test_attributes, test_value_type = read(test_data_path, test_scheme_path)
    result_data = pre_processing.pre_process(test_data, test_attributes, test_value_type)
    print(result_data)