-
Notifications
You must be signed in to change notification settings - Fork 0
/
arfreader.py
executable file
·225 lines (200 loc) · 9.07 KB
/
arfreader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
"""
Authors: Nick Gisolfi, Dan Howarth
"""
from json import dumps
from re import split as regex_split
class ARFReader:
    """Read an autonRF model file and re-express it as an sklearn-style random forest.

    Only the tree structure (split attribute, threshold, children and leaf
    class counts) is loaded; the remaining sklearn metadata is filled with
    placeholder values and should not be trusted.

    Currently this is only used to test whether the outputs of the python
    version for inbounds, mean entropy, etc. match the outputs of the C version.

    Requires sklearn-json. Use the fork at https://github.com/howarth/sklearn-json/
    for a version compatible with more recent sklearn files.

    usage:
        arf_reader = ARFReader(autonRF_c_model_file)
        arf_reader.write2json(temporary_json_file)
        rf_model = sklearn_json.from_json(temporary_json_file)
    """

    # Body of a regex character class listing the characters stripped from
    # attribute names. Kept as a raw string so the backslashes reach the regex
    # engine verbatim without relying on invalid string escape sequences.
    UNALLOWED_CHARS = r'''!@#$%^&*()_+~`=\-\[\]\{\}:;"'<>,.\/?|'''

    def __init__(self, fileName):
        """Parse the ARF model stored at ``fileName``.

        After construction ``self.params`` holds the header information and
        ``self.model`` maps each 1-based tree id to its root node dict.
        """
        with open(fileName, 'r') as fp:
            self.params = self.read_params(fp)
            self.model = self.read_model(fp)

    @staticmethod
    def _expect_line(fp, expected):
        """Consume one line from ``fp`` and verify it equals ``expected``.

        The read is done outside of an ``assert`` statement so the line is
        still consumed (and checked) when Python runs with ``-O``.
        AssertionError is raised explicitly so callers observe the same
        exception type as the assert-based implementation did.
        """
        line = fp.readline().strip()
        if line != expected:
            raise AssertionError(
                'Malformed ARF file: expected {!r} but read {!r}'.format(expected, line)
            )

    def _read_string_array(self, fp):
        """Read one ``<string_array>`` section and return its entries as a list."""
        self._expect_line(fp, '<string_array>')
        # The size line has the form "size N".
        count = int(fp.readline().strip().split()[1])
        items = [fp.readline().strip() for _ in range(count)]
        self._expect_line(fp, '</string_array>')
        return items

    def make_safe_names(self, names):
        """Return ``names`` with unallowed characters removed, CamelCase-joined.

        Each name is split on runs of ``UNALLOWED_CHARS``; every resulting
        piece is capitalized and the pieces are concatenated.
        """
        pattern = '[' + self.UNALLOWED_CHARS + ']+'
        return [
            ''.join(word.capitalize() for word in regex_split(pattern, attName))
            for attName in names
        ]

    def read_params(self, fp):
        """Read the ARF header from ``fp`` and return it as a dict.

        Keys: ``num_atts``, ``att_names``, ``num_trees``, ``target_attnum``,
        ``num_classes`` and ``class_names``.

        Raises:
            AssertionError: if a structural marker is not where it is expected.
        """
        params = {}

        # Skip ARF preamble: a one-element string array whose value is unused.
        self._expect_line(fp, '<bag_model>')
        self._expect_line(fp, '1')
        self._expect_line(fp, '<string_array>')
        self._expect_line(fp, 'size 1')
        fp.readline()
        self._expect_line(fp, '</string_array>')

        # Read attribute names
        raw_att_names = self._read_string_array(fp)
        params['num_atts'] = len(raw_att_names)
        params['att_names'] = self.make_safe_names(raw_att_names)

        # Read #trees and target attribute number
        params['num_trees'] = int(fp.readline().strip())
        params['target_attnum'] = int(fp.readline().strip())

        # Read class names
        class_names = self._read_string_array(fp)
        params['num_classes'] = len(class_names)
        params['class_names'] = class_names
        return params

    def read_model(self, fp):
        """Read every decision tree from ``fp``.

        Returns a dict mapping the 1-based tree id to the tree's root node.
        Node ids restart at 0 for every tree.
        """
        model = {}
        for mID in range(1, self.params['num_trees'] + 1):
            self._expect_line(fp, '<decision_tree>')
            tree, _ = self.mk_node(fp, 0, mID)
            model[mID] = tree
            self._expect_line(fp, '</decision_tree>')
        return model

    def mk_node(self, fp, nID, mID):
        """Recursively read one ``<decision_node>`` (and its subtree) from ``fp``.

        Args:
            fp: open text stream positioned at a ``<decision_node>`` line.
            nID: id assigned to this node; children are numbered in pre-order.
            mID: id of the tree this node belongs to.

        Returns:
            ``(node, last_nID)`` where ``node`` is the node dict and
            ``last_nID`` is the highest node id used inside this subtree.

        Raises:
            AssertionError: if a structural marker is not where it is expected.
            NotImplementedError: if a regression (non-classification) leaf is found.
        """
        node = {'nID': nID, 'mID': mID}
        self._expect_line(fp, '<decision_node>')

        # Is this a leaf?
        if fp.readline().strip() == 'true':
            node['is_leaf'] = True
            # Is this leaf a classification leaf?
            if fp.readline().strip() != 'true':
                # Reading the mean output of a regression leaf is not
                # supported. Fail here rather than returning a half-built
                # node with the stream left in the middle of the leaf.
                raise NotImplementedError('Regression Not Implemented')
            node['is_classification'] = True

            # Read sample counts at this leaf
            self._expect_line(fp, '<dyv>')
            numElements = int(fp.readline().strip().split()[1])
            node['distribution'] = [int(fp.readline().strip()) for _ in range(numElements)]
            # Same column layout as the internal-node row below; the negative
            # values appear to mirror sklearn's leaf / undefined sentinels.
            node['nodes_json'] = [-1, -1, -2, -2.0, 0, 0, 0]
            node['values_json'] = [[float(e) for e in node['distribution']]]
            self._expect_line(fp, '</dyv>')
            self._expect_line(fp, '</decision_node>')
        else:
            # This is not a leaf
            node['is_leaf'] = False
            node['att_num'] = int(fp.readline().strip())
            node['att_name'] = self.params['att_names'][node['att_num']]
            # The next fields are kept as the raw strings read from the file.
            node['is_symbolic'] = fp.readline().strip()
            node['contains_missing_values'] = fp.readline().strip()
            node['missing_value_decision_path'] = fp.readline().strip()
            node['threshold'] = float(fp.readline().strip())
            node['min'] = fp.readline().strip()
            node['max'] = fp.readline().strip()

            # The left subtree takes the ids directly after this node; the
            # right subtree continues from the last id the left one used.
            node['left_child_nID'] = nID + 1
            LC, nID = self.mk_node(fp, nID + 1, mID)
            node['right_child_nID'] = nID + 1
            RC, nID = self.mk_node(fp, nID + 1, mID)

            node['nodes_json'] = [
                node['left_child_nID'],
                node['right_child_nID'],
                node['att_num'],
                node['threshold'],
                0,  # gini or whatever
                0,  # number of samples in leaf
                0,  # weighted sum of samples in leaf
            ]
            node['values_json'] = [[0.0 for _ in range(self.params['num_classes'])]]
            node['left_child'] = LC
            node['right_child'] = RC
            self._expect_line(fp, '</decision_node>')
        return node, nID

    # It wound up being easier to read in ARF, rewrite to file as JSON with a
    # scikit-learn-esque structure, then read back in.
    def write2json(self, filename):
        """Serialize the forest returned by ``to_json`` into ``filename``."""
        with open(filename, 'w') as writer:
            writer.write(dumps(self.to_json()))

    def nodes2json(self, json, node):
        """Append the ``nodes_json`` rows of ``node``'s subtree to ``json`` in pre-order."""
        json.append(node['nodes_json'])
        if not node['is_leaf']:
            json = self.nodes2json(json, node['left_child'])
            json = self.nodes2json(json, node['right_child'])
        return json

    def values2json(self, json, node):
        """Append the ``values_json`` rows of ``node``'s subtree to ``json`` in pre-order."""
        json.append(node['values_json'])
        if not node['is_leaf']:
            json = self.values2json(json, node['left_child'])
            json = self.values2json(json, node['right_child'])
        return json

    def dtypes2json(self, json, node):
        """Return the dtype codes for the seven columns of a ``nodes_json`` row.

        Both arguments are ignored; they are kept for signature parity with
        the other ``*2json`` helpers.
        """
        return ['<i8', '<i8', '<i8', '<f8', '<f8', '<i8', '<f8']

    def get_node_count(self, count, node):
        """Return ``count`` plus the number of nodes in ``node``'s subtree."""
        count += 1
        if not node['is_leaf']:
            count = self.get_node_count(count, node['left_child'])
            count = self.get_node_count(count, node['right_child'])
        return count

    def to_json(self):
        """Return the whole forest as a JSON-serializable dict.

        Apart from the tree structure, attribute/class counts and tree count,
        the values are fixed placeholders.
        """
        # NOTE(review): 'n_features_' is num_atts here but num_atts - 1 in
        # estimator2json; left unchanged -- confirm which one consumers expect.
        rf = {
            'meta': 'rf',
            'max_depth': 0,
            'min_samples_split': 2,
            'min_samples_leaf': 1,
            'min_weight_fraction_leaf': 0.0,
            'max_features': 'auto',
            'max_leaf_nodes': None,
            'min_impurity_decrease': 0.0,
            'n_features_': self.params['num_atts'],
            'n_outputs_': 1,
            'classes_': list(range(self.params['num_classes'])),
            'estimators_': [
                self.estimator2json(i) for i in range(1, self.params['num_trees'] + 1)
            ],
            'params': {
                'bootstrap': True,
                'class_weight': None,
                'criterion': 'ALRF',
                'max_depth': 0,
                'max_features': None,
                'max_leaf_nodes': None,
                'min_impurity_decrease': 0.0,
                'min_samples_leaf': 1,
                'min_samples_split': 2,
                'min_weight_fraction_leaf': 0.0,
                'n_estimators': self.params['num_trees'],
                'n_jobs': None,
                'oob_score': None,
                'random_state': 0,
                'verbose': 0,
                'warm_start': False,
            },
            'n_classes_': self.params['num_classes'],
        }
        return rf

    def estimator2json(self, mID):
        """Return tree ``mID`` (1-based) as a JSON-serializable decision-tree dict."""
        root = self.model[mID]
        # One attribute is excluded from the feature count (presumably the
        # target attribute -- TODO confirm).
        n_features = self.params['num_atts'] - 1
        dt = {
            'meta': 'decision-tree',
            'feature_importances_': [0 for _ in range(n_features)],
            'max_features_': n_features,
            'n_classes_': self.params['num_classes'],
            'n_features_': n_features,
            'n_outputs_': 1,
            'tree_': {
                'max_depth': 0,
                'node_count': self.get_node_count(0, root),
                'nodes': self.nodes2json([], root),
                'values': self.values2json([], root),
                'nodes_dtype': self.dtypes2json([], root),
            },
            # Derived from the model so it agrees with to_json() and with
            # 'n_classes_' instead of assuming exactly three classes.
            'classes_': list(range(self.params['num_classes'])),
            'params': {
                'class_weight': None,
                'criterion': 'ALRF',
                'max_depth': 0,
                'max_features': 'auto',
                'max_leaf_nodes': None,
                'min_impurity_decrease': 0.0,
                'min_samples_leaf': 1,
                'min_samples_split': 2,
                'min_weight_fraction_leaf': 0.0,
                'random_state': 15599,
                'splitter': 'best',
            },
        }
        return dt