-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_cleanup.py
88 lines (76 loc) · 2.77 KB
/
data_cleanup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import csv
import sys
import datetime
from dateutil import parser
from collections import defaultdict
import os
def cleanData(inFile, outFile):
    """Clean one comma-separated traffic file and write the cleaned outputs.

    The first line of ``inFile`` is treated as the header. Every following
    line is split on ``','``; the last column is used as the grouping key
    and the third column (index 2) is parsed as a date/time (sample value:
    '1/3/18 8:17') and replaced by seconds since 1970-01-01. The parsed value
    is treated as naive; no timezone conversion is applied.

    A row is dropped when it is blank, starts with ``'D'``, or contains
    ``'Infinity'`` / ``'infinity'`` anywhere in the line.

    Files written:
        ``outFile + '.csv'``       header plus all kept rows, grouped by key.
        ``outFile + '.stats'``     kept/dropped totals and per-key counts.
        ``'<outFile>-<key>.csv'``  header plus the kept rows of one key.

    Args:
        inFile: Path of the CSV file to read.
        outFile: Output path prefix; the suffixes above are appended to it.

    Note:
        ``count`` starts at 1, so the reported "Total Clean Rows" includes
        the header line; "Dropped Rows" is the total line count minus that.
        An empty input file produces no output files.
    """
    print('cleaning {}'.format(inFile))
    with open(inFile, 'r') as csvfile:
        data = csvfile.readlines()
    totalRows = len(data)
    print('total rows read = {}'.format(totalRows))

    if not data:
        # No header line at all: there is nothing meaningful to write.
        print('no rows found in {}; nothing written'.format(inFile))
        return

    # Guarantee the header ends with exactly one newline so the first data
    # row can never be glued onto it.
    header = data[0].rstrip('\n') + '\n'
    epoch = datetime.datetime(1970, 1, 1)

    count = 1  # the header is counted as a written row
    stats = defaultdict(list)
    dropStats = defaultdict(int)

    for raw in data[1:]:
        line = raw.strip()
        if not line:
            # Blank lines (e.g. a trailing one) have no timestamp column.
            dropStats['<blank>'] += 1
            continue

        cols = line.split(',')
        key = cols[-1]
        if line.startswith('D') or 'Infinity' in line or 'infinity' in line:
            dropStats[key] += 1
            continue

        dt = parser.parse(cols[2])  # '1/3/18 8:17'
        cols[2] = str((dt - epoch).total_seconds())
        stats[key].append(','.join(cols))
        count += 1

    dropped = totalRows - count
    with open(outFile + ".csv", 'w') as csvoutfile, \
            open(outFile + ".stats", 'w') as fout:
        csvoutfile.write(header)
        fout.write('Total Clean Rows = {}; Dropped Rows = {}\n'.format(
            count, dropped))
        for key, rows in stats.items():
            fout.write('{} = {}\n'.format(key, len(rows)))
            body = '\n'.join(rows) + '\n'
            csvoutfile.write(body)
            with open('{}-{}.csv'.format(outFile, key), 'w') as labelOut:
                labelOut.write(header)
                # Same newline-terminated body as the combined file.
                labelOut.write(body)
        for key, dropCount in dropStats.items():
            fout.write('Dropped {} = {}\n'.format(key, dropCount))

    print('all done writing {} rows; dropped {} rows'.format(
        count, dropped))
def cleanAllData(inputDataPath='../ProcessedTrafficData',
                 outputDataPath='../NewCleanedData'):
    """Run ``cleanData`` on every regular file in ``inputDataPath``.

    Entries whose name starts with ``'.'`` and sub-directories are skipped.
    The output directory is created (including missing parents) if needed.

    Args:
        inputDataPath: Directory holding the files to clean. Defaults to the
            location the script has always used.
        outputDataPath: Directory that receives the cleaned files. Defaults
            to the location the script has always used.
    """
    os.makedirs(outputDataPath, exist_ok=True)

    # Sorted so the processing order does not depend on the filesystem.
    for name in sorted(os.listdir(inputDataPath)):
        if name.startswith('.'):
            continue
        inputFile = os.path.join(inputDataPath, name)
        # The directory test must use the joined path; a bare name would be
        # resolved against the current working directory instead.
        if os.path.isdir(inputFile):
            continue
        # NOTE(review): the output prefix keeps the input file's extension,
        # and cleanData appends '.csv' / '.stats' to it (e.g. 'x.csv.csv').
        # Left as-is in case downstream tooling relies on those names.
        outFile = os.path.join(outputDataPath, name)
        cleanData(inputFile, outFile)
def main(argv):
    """Dispatch the command line and return a process exit status.

    Args:
        argv: Full argument vector, program name first (as ``sys.argv``).

    Returns:
        0 after a cleaning run, 1 when the arguments are insufficient and
        only the usage text was printed.
    """
    args = argv[1:]
    if args and args[0] == 'all':
        cleanAllData()
        return 0
    if len(args) < 2:
        # Covers both "no arguments" and "input file without output prefix";
        # the latter used to fail with an IndexError on sys.argv[2].
        print('Usage: python data_cleanup.py inputFile.csv outputFile')
        print('       python data_cleanup.py all')
        return 1
    cleanData(args[0], args[1])
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))