-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfreq_tables.py
127 lines (105 loc) · 4.26 KB
/
freq_tables.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# -*- coding: utf-8 -*-
#******************************************************************************
#
# Copyright (C) 2019, Institute of Telecommunications, TU Wien
#
# Name : freq_tables.py
# Description : Frequency Tables Generator and One Hot Encoding transformation
# Author : Fares Meghdouri
#
#******************************************************************************
# Startup banner, written to stdout as soon as this file is executed or imported.
# NOTE(review): "Frquency" is misspelled in the banner text; kept as-is because
# it is runtime output.
print("\n".join([
    "",
    "*************************************************************",
    "Frquency Tables Generator and One Hot Encoding transformation",
    "FM, Feb 2019, http://cn.tuwien.ac.at",
    "*************************************************************",
    "",
]))
#############################################################
# script name: freq_tables.py
version = "0.0.1"
#############################################################
import sys
import pandas as pd
import shutil
import os
#############################################################
# Folder that receives one frequency-table CSV per feature; main() deletes and
# recreates it on every run.
output_folder = "frequency_tables"
# Columns that are never offered for one hot encoding.
column_exceptions = ["Attack", "Label"]

# Defaults for the settings read by main(); when the file is run as a script
# each of them is overwritten from the command line arguments.
mode, input_file = "stat", ""
topn, threshold = 10, 95
out_file = "freq_tables_data.csv"
#############################################################
def _get_dummies(column, threshold):
    """One hot encode the frequent values of *column*.

    Every value whose share of the rows is below *threshold* is merged into a
    temporary "others" category. The indicator column of that category is
    dropped from the result, so rows holding a rare value are encoded as all
    zeros.

    Args:
        column: pandas Series to encode. It is not modified; its name is used
            as the prefix of the generated column names.
        threshold: minimum share of the rows, in percent, a value needs in
            order to get its own indicator column.

    Returns:
        A DataFrame with one indicator column per retained value, named
        "<column name>_<value>".
    """
    shares = column.value_counts() / column.shape[0] * 100
    frequent_values = shares[shares >= threshold].index

    # Switch to object dtype first so the "others" label can be stored next to
    # numeric values without relying on implicit dtype upcasting.
    grouped = column.astype(object).where(column.isin(frequent_values), "others")
    dummies = pd.get_dummies(grouped, prefix=column.name)

    # Drop the "others" indicator to avoid the dummy variable trap. The column
    # does not exist when every value is frequent, hence errors='ignore'.
    return dummies.drop(columns='{}_others'.format(column.name), errors='ignore')
def main():
    """Write a frequency table per feature and, in "ohe" mode, one hot encode on request.

    The function takes no arguments; it reads the module-level settings
    ``input_file``, ``output_folder``, ``topn``, ``mode``, ``threshold``,
    ``column_exceptions`` and ``out_file``.

    Steps:
        1. Load ``input_file`` as CSV and replace missing values with 0.
        2. Delete and recreate ``output_folder``.
        3. For every column, store the ``topn`` most frequent values with
           their share (in percent) and the accumulated share as
           ``<output_folder>/<column>.csv``.
        4. In "ohe" mode, for every column whose top values cover at least
           ``threshold`` percent and that is not listed in
           ``column_exceptions``, ask on the console whether it should be
           replaced by dummy columns.
        5. In "ohe" mode only, write the resulting data to ``out_file``.
    """
    print("Reading {}".format(input_file))
    data = pd.read_csv(input_file).fillna(0)

    # Always start from an empty output folder.
    if os.path.exists(output_folder):
        shutil.rmtree(output_folder)
    os.makedirs(output_folder)

    print("Computing the frequency tables for {} feature...".format(len(data.columns)))
    print("*********************************")

    # Python 2 offers the non-evaluating prompt under the name raw_input only.
    try:
        ask = raw_input  # noqa: F821
    except NameError:
        ask = input

    ohe_mode = mode.lower() == "ohe"
    to_convert = []

    # data.columns is evaluated once, before the loop, so the dummy columns
    # appended to data below are never visited themselves.
    for col in data.columns:
        freq = (data[col].value_counts() / data.shape[0] * 100).rename_axis('unique_values').reset_index(name='counts').head(topn)
        freq['Accumulation %'] = freq['counts'].cumsum(axis=0)
        freq.to_csv(os.path.join(output_folder, "{}.csv".format(col)))

        if freq.empty:
            # Input without rows: there is no last entry to read statistics from.
            print("no values found for {}, nothing to report".format(col))
            continue

        # Share covered by all listed values together ...
        acc = float(freq['Accumulation %'].iloc[-1])
        # ... and share of the least frequent listed value; handed to
        # _get_dummies as the minimum share a value needs to be encoded.
        thr = float(freq['counts'].iloc[-1])

        if ohe_mode and acc >= threshold and col not in column_exceptions:
            statement = "top {}\t for {:50s} represent {:.2f} of the data \t {}".format(freq.shape[0], col, acc, "Convert?(yes:enter, no:any character+enter)")
            choice = ask(statement)
            if choice == "":
                tmp = _get_dummies(data[col], thr)
                data = pd.concat([data, tmp], axis=1)
                data = data.drop(col, axis=1)
                to_convert.append(col)
        else:
            print("top {}\t for {:50s} represent {:.2f} of the data".format(freq.shape[0], col, acc))

    if ohe_mode:
        print("*********************************")
        print("The following features were converted into dummies:")
        print(to_convert)
        print("*********************************")
        print("Saving...")
        data.to_csv(out_file, index=False)

    print("Finished without any errors... Exiting")
if __name__ == "__main__":
    # The five positional arguments overwrite the module-level defaults that
    # main() reads.
    try:
        mode = sys.argv[1]
        input_file = sys.argv[2]
        topn = int(sys.argv[3])
        threshold = int(sys.argv[4])
        out_file = sys.argv[5]
    except (IndexError, ValueError):
        # IndexError: fewer than five arguments were given.
        # ValueError: <top-n> or <threshold> is not an integer.
        print("""
Usage: > python freq_tables.py <mode> <data> <top-n> <threshold> <output>
<mode> stat: gives frequency tables of all features and save them into an external folder
ohe : same as stat but perform also one hot encoding of the top n values
<data> csv file holding the raw data
<top-n> the number of top values to be considered in the frequency tables and the dummy mapping
<threshold> for the one hot encoding cosider only those features where the <top-n> represent more than the threshold
<output> the output file (csv format)
Example: > python freq_tables.py ohe input.csv 5 98 output.csv
show statistics for the top five distinct values and map those who reaches 98% representation only by 5 top
NOTE: even if you are using the stat mode, please provide an output file (it will not be created)
""")
        # sys.exit instead of the site-provided exit() helper, which is not
        # guaranteed to exist (e.g. when Python runs with -S).
        sys.exit(1)
    main()