forked from UofUMetabolomicsCore/QSRR_Automator
-
Notifications
You must be signed in to change notification settings - Fork 0
/
collect_default_values.py
195 lines (177 loc) · 11.5 KB
/
collect_default_values.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
"""
Copyright (c) 2020 Bradley Naylor, James Cox and University of Utah
All rights reserved.
Redistribution and use in source and binary forms,
with or without modification, are permitted provided
that the following conditions are met:
* Redistributions of source code must retain the
above copyright notice, this list of conditions
and the following disclaimer.
* Redistributions in binary form must reproduce
the above copyright notice, this list of conditions
and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the author nor the names of any contributors
may be used to endorse or promote products derived
from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
"""
since the settings are based on .csvs to make it easy for the user or future programmers to modify we need to prevent issues.
the main issue is avoiding incorrect types, invalid values and so on. This prevents the need for tons of ifs or try excepts whenever we use the a setting or programmer setting
this is mostly a port of previous code created for Reportorator. a bit more complicated because of the need for two option sets but is otherwise similar
"""
import csv
import os
import multiprocessing
USER_SETTINGS_FILE = "User_Settings_Defaults.csv"
PROGRAMER_SETTINGS_FILE = "Programmer_Settings_Defaults.csv"
class String_Values(object):
#name is the name of the variable, lowere needed is a boolean to determine is we should force it to lower (case insensitive), allowed_values is a list of allowed values for this string, and boolean is a boolean to say if this should be turned into a boolean (True) or left as a string (False)
def __init__(self, name, lower_needed, allowed_values, boolean):
self.name = name
self.lower_needed = lower_needed
self.allowed_values = allowed_values
self.boolean = boolean
self.error = "" #if this get's filled we have an issue with this string
def check_value (self, value):
value = str(value)
if self.lower_needed:
value = value.lower()
if value not in self.allowed_values:
#!functional
self.error = "\"{}\" not a valid default value for variable \"{}\". Please change to a variable from the allowed list: {}".format(value, self.name, self.allowed_values)
elif self.boolean:
if value == "true":
self.value = True
elif value == "false":
self.value = False
else:
self.value = value
#this is a super class for int and float sub classes. the sub classes actually try to force the values into the proper variable types with appropriate error messages for failure
#this governs things like too low or too high
class Numerical_Values(object):
def __init__(self, name, minimum, maximum):
self.name = name
self.minimum = minimum
self.maximum = maximum
self.error = ""
def check_value(self,value):
if value < self.minimum:
#!functional
self.error = "{}, the default value provided for \"{}\" is less than the allowed minimum of {}".format(value, self.name, self.minimum)
elif value > self.maximum:
#!functional
self.error = "{}, the default value provided for \"{}\" is greater than the allowed maximum of {}".format(value, self.name, self.maximum)
else:
self.value = value
class Float_Values (Numerical_Values):
def check_value(self, value):
try:
value = float(value)
except ValueError:
#!functional
self.error = "{}, the default value provided for \"{}\" could not be converted to a floating point value. Please correct.".format(value, self.name)
return #no point in dealing with anything else if the error is already established
super().check_value(value)
class Int_Values (Numerical_Values):
def check_value(self, value):
try:
value = int(value)
except ValueError:
#!functional
self.error = "{} the default value provided for \"{}\" could not be converted to an integer. Please correct.".format(value, self.name)
return #no point in dealing with anything else if the error is already established
super().check_value(value)
#for most of the elements in these dictionaries the minimum and maximum values are not reasonable. the goal is to prevent entirely stupid values (negative values, percents greater than 100% on a filter)
#we also want to avoid values outside of those allowed by the settings gui so that it does not error out
user_settings_and_values = {
"Void volume cutoff": Float_Values("Void volume cutoff", 0.00, 9999.99),
"Allowed same value per descriptor": Float_Values("Allowed same value per descriptor", .1, .99),
"NaNs allowed per sample": Float_Values("NaNs allowed per sample", 0.0, .90),
"Correlation Coefficient (Pearson)": Float_Values("Correlation Coefficient (Pearson)", .05, 1.00),
"Excel Writing": String_Values("Excel Writing", True, ["true", "false"], True),
"Model To Use": String_Values("Model To Use", False, ["Choose Best", "Random Forest", "SVR", "LR"], False),
"Number of Cross Validations": Int_Values("Number of Cross Validations", 5, 10),
"Feature Selection Method": String_Values("Feature Selection Method", False, ["Automatic", "None", "Manual"], False),
"Automatic Feature Selection CV": Int_Values("Automatic Feature Selection CV", 5, 10),
"Min Number of Features": Int_Values("Min Number of Features", 5, 9999),
"Target Number Features for Manual": Int_Values("Target Number Features for Manual", 5, 9999),
"C min": Float_Values("C min", 0.000100, 99999.0),
"C max": Float_Values("C max", 0.000100, 99999.0),
"C number of steps": Int_Values("C number of steps", 1, 99),
"gamma min": Float_Values("gamma min", 0.000100, 99999.0),
"gamma max": Float_Values("gamma max", 0.000100, 99999.0),
"gamma number of steps": Int_Values("gamma number of steps", 1, 99)
}
programmer_settings_and_values = {
"Trees in Random Forest": Int_Values("Trees in Random Forest", 5, 100000),
"Allowed Model Loading Error": Float_Values("Allowed Model Loading Error", .001, .2),
"Minimum Machine Learning Descriptors": Int_Values("Minimum Machine Learning Descriptors", 3, 1000),
"Auto Feauture Selection Step Size": Int_Values("Auto Feauture Selection Step Size", 1, 20),
"Minimum Number of Samples Allowed": Int_Values("Minimum Number of Samples Allowed", 50, 1000)
}
#there are a few min max pairs we need to ensure that the minimum is <= the max. this is based of of the min_max_checker function in the settings_menu.py
#there are in order of minimum, maximum, step size, and if steps between them must be integers
user_settings_min_max = [["C min", "C max", "C number of steps", False], ["gamma min", "gamma max", "gamma number of steps", False]]
def deal_with_duplicates(dictionary_to_check, list_of_duplicates, filename):
for t in list_of_duplicates:
if len(t) == 4:
if dictionary_to_check[t[0]].value > dictionary_to_check[t[1]].value:
return "\"{}\" is greater than \"{}\" in \"{}\". Please correct.".format(t[0], t[1], filename)
if dictionary_to_check[t[0]].value == dictionary_to_check[t[1]].value and dictionary_to_check[t[2]].value != 1:
return "\"{}\" is equal to \"{}\" but \"{}\" is not 1. Values are in {}. Cannot have multiple steps between the same values. Please correct.".format(t[0], t[1], t[2], filename)
if dictionary_to_check[t[0]].value != dictionary_to_check[t[1]].value and dictionary_to_check[t[2]].value == 1:
return "\"{}\" is not equal to \"{}\" but \"{}\" is 1. Values are in {}. Cannot have multiple steps between the same values. Please correct.".format(t[0], t[1], t[2], filename)
if t[3] and ((dictionary_to_check[t[1]].value - dictionary_to_check[t[0]].value) < (dictionary_to_check[t[2]].value - 1)): #first, last and any integers between. 3-1 =2, so 3-1 steps is valid. 4 steps requires decimals.
return "\"{}\" and \"{}\" are close enough that it would require decimals to have provided number of steps. Please adjust minimum, maximum or step number in {}.".format(t[0], t[1], filename)
elif len(t) == 2:
if dictionary_to_check[t[0]].value >= dictionary_to_check[t[1]].value:
return "\"{}\" is greater than or equal to \"{}\" in {}. Please correct".format(t[0], t[1], filename)
return "Success"
#since we need to do this twice we'll make this function to be run with get_user_defaults so we can keep all relevant things here
def get_default_values(folder_location, get_user_defaults):
if get_user_defaults:
file_name_to_use = os.path.join(folder_location, USER_SETTINGS_FILE)
dict_to_use = user_settings_and_values
duplicate_list_to_check = user_settings_min_max
#need to go a step up in the output folder
default_settings = {"Output Folder": os.path.join(os.path.dirname(folder_location), "QSRR_Automator_Output"), "Processors_to_Use": multiprocessing.cpu_count()-1} #set the defaults for this one.
else:
file_name_to_use = os.path.join(folder_location, PROGRAMER_SETTINGS_FILE)
dict_to_use = programmer_settings_and_values
default_settings = {}
with open(file_name_to_use) as input_file:
reader = csv.reader(input_file)
#don't have a header so don't need to drop it
for row in reader:
if row[0] in dict_to_use.keys():
dict_to_use[row[0]].check_value(row[1])
if dict_to_use[row[0]].error:
return dict_to_use[row[0]].error
else:
default_settings[row[0]] = dict_to_use[row[0]].value
else:
return "The Variable Name \"{}\" is present in the {} file and should not be. please correct.".format(row[0], file_name_to_use)
#need to ensure we have all the variables
missing_variables = [x for x in list(dict_to_use.keys()) if x not in list(default_settings.keys())]
if missing_variables:
return "The following variables were not found in {} : {}. Please correct to proceed.".format(file_name_to_use, missing_variables)
if get_user_defaults: # no duplicates for programmer
final_error_check = deal_with_duplicates(dict_to_use, duplicate_list_to_check, file_name_to_use)
if final_error_check != "Success":
return final_error_check
return default_settings