# csv-to-arff.py
# Forked from anaavila/convert-csv-to-arff
# Converts comma separated value (CSV) files
# to Attribute-Relation File Format (ARFF).
# where 'converts' means that an arff file is created and populated with
# the csv data. The csv file is not deleted or modified.
#
# @Author Ana Avila - github.com/anaavila
# @Date December, 2016 - January, 2017
#
# Python 3.6
#
# @Description
# Simple Python program that reads a csv file, takes its attributes from the
# first row and assigns each attribute a data type ("numeric" or "nominal").
# It lists the unique data values of each nominal attribute, and
# inserts a '0' into each empty cell.
#
# This program was made to facilitate some csv data cleaning when I was
# trying to open a csv file in Weka, for a school research project.
# This program helps to clean the csv file by converting it to arff format
# when the csv file has some inconsistencies, such as having numeric and
# nominal values for the same attribute values, and when it has empty cells.
#
# @About the ARFF format and Weka Software
# ARFF file format is used with Weka, a machine learning software from the
# University of Waikato. Information about the ARFF file and Weka is on the
# University of Waikato website: https://www.cs.waikato.ac.nz/ml/weka/arff.html
#
#
# Note:
# You can open the arff file with a text editor
#
import csv
import os
fileToRead = "worldcup2014.csv"    # CSV file name or absolute path to read.
fileToWrite = "worldcup2014.arff"  # Name under which the ARFF file is saved.
relation = "World Cup 2014"        # Name given to the ARFF relation.

# float() accepts these words and they pass the canonical-text check in
# _is_numeric(), but in a data file they are labels rather than numbers.
_NON_FINITE = ("inf", "-inf", "nan")


def _is_numeric(cell):
    """Return True if *cell* is the canonical text of an int or a float.

    Only text that round-trips exactly through ``str(float(cell))`` or
    ``str(int(cell))`` counts as numeric ("5", "-3", "1.5").  Anything else
    ("abc", "007", "1.50", "1e5", " 5", "inf") is nominal.
    """
    if cell in _NON_FINITE:
        return False
    try:
        return cell == str(float(cell)) or cell == str(int(cell))
    except ValueError:
        return False


def _quote(text):
    """Return *text* in single quotes with backslash escapes applied.

    Backslashes, single quotes and line breaks are escaped so the quoted
    value always stays on one line and cannot end the quoting early.
    """
    escaped = text.replace("\\", "\\\\").replace("'", "\\'")
    escaped = escaped.replace("\n", "\\n").replace("\r", "\\r")
    return "'" + escaped + "'"


def _unique_in_order(values):
    """Return the distinct items of *values*, keeping first-seen order."""
    seen = set()
    ordered = []
    for value in values:
        if value not in seen:
            seen.add(value)
            ordered.append(value)
    return ordered


def csv_rows_to_arff(rows, relation_name):
    """Return the text of an ARFF document built from parsed CSV rows.

    Args:
        rows: Sequence of rows as produced by ``csv.reader``; the first
            non-empty row holds the attribute names, the rest hold data.
        relation_name: Name written after ``@relation`` (always quoted).

    Returns:
        The complete ARFF text: comment header, ``@relation``, one
        ``@attribute`` line per column and the ``@data`` section.

    Raises:
        ValueError: If there is no header row, or if a data row has more
            cells than the header has attributes.

    Data cells are lower-cased, empty cells become ``0``, trailing line
    breaks are removed and every non-numeric cell is single-quoted.  A
    column is ``numeric`` only if all of its cells are numeric; otherwise
    it is nominal and lists its distinct values.
    """
    # csv.reader yields [] for blank lines; they carry no data, so drop them.
    rows = [list(row) for row in rows if row]
    if not rows:
        raise ValueError("the CSV data has no header row")

    # An empty header cell is named '0', like any other empty cell.
    attributes = [name or "0" for name in rows[0]]
    total_cols = len(attributes)

    data = []
    nominal_cols = set()
    for number, row in enumerate(rows[1:], start=2):
        if len(row) > total_cols:
            raise ValueError(
                "CSV record {0} has {1} cells but the header defines only "
                "{2} attributes".format(number, len(row), total_cols)
            )
        # Short rows are padded; the padding is filled with '0' below.
        row.extend([""] * (total_cols - len(row)))

        formatted = []
        for col, cell in enumerate(row):
            cell = (cell or "0").lower().rstrip("\r\n")
            if not _is_numeric(cell):
                cell = _quote(cell)
                nominal_cols.add(col)
            formatted.append(cell)
        data.append(formatted)

    # A nominal attribute is declared with the set of values it takes.
    att_types = []
    for col in range(total_cols):
        if col in nominal_cols:
            values = _unique_in_order(record[col] for record in data)
            att_types.append("{" + ",".join(values) + "}")
        else:
            att_types.append("numeric")

    parts = [
        # Comments
        "%\n% Comments go after a '%' sign.\n%\n",
        "%\n% Relation: " + relation_name + "\n%\n%\n",
        "% Attributes: " + str(total_cols) + " " * 5
        + "Instances: " + str(len(data)) + "\n%\n%\n\n",
        # Relation (quoted: required whenever the name contains spaces)
        "@relation " + _quote(relation_name) + "\n\n",
    ]
    # Attributes
    for name, att_type in zip(attributes, att_types):
        parts.append("@attribute " + _quote(name) + " " + att_type + "\n")
    # Data
    parts.append("\n@data\n")
    parts.extend(",".join(record) + "\n" for record in data)
    return "".join(parts)


def convert_csv_to_arff(csv_path, arff_path, relation_name):
    """Read the CSV file *csv_path* and write it as ARFF to *arff_path*.

    The CSV file is neither deleted nor modified.  The ARFF file is opened
    only after the conversion succeeded, so a failed run does not leave an
    empty or truncated output file behind.

    Raises:
        OSError: If either file cannot be opened.
        ValueError: If the CSV content is rejected by ``csv_rows_to_arff``.
    """
    # newline="" lets the csv module handle line endings inside quoted cells.
    with open(csv_path, "r", newline="") as csv_file:
        rows = list(csv.reader(csv_file))

    arff_text = csv_rows_to_arff(rows, relation_name)

    with open(arff_path, "w") as arff_file:
        arff_file.write(arff_text)


def main():
    """Convert ``fileToRead`` to ``fileToWrite`` and report the result."""
    convert_csv_to_arff(fileToRead, fileToWrite, relation)
    print(fileToRead + " was converted to " + fileToWrite)


if __name__ == "__main__":
    main()