-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlevel_2_optionals_cdsu.py
205 lines (150 loc) · 14.7 KB
/
level_2_optionals_cdsu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
import sys
import time
import logging
import pandas as pd
from traceback import format_exc
import level_2_optionals_cdsu_options
from level_2_optionals_cdsu_options import project_id
from modules.level_1_a_data_acquisition import sql_retrieve_df
from modules.level_1_b_data_processing import null_analysis, options_scraping_v2, remove_zero_price_total_vhe, lowercase_column_conversion, remove_rows, remove_columns, string_replacer, color_replacement, new_column_creation, score_calculation, duplicate_removal, total_price, margin_calculation, new_features, column_rename
from modules.level_1_d_model_evaluation import data_grouping_by_locals_temp
from modules.level_1_e_deployment import sql_inject, sql_delete, sql_date_comparison
from modules.level_0_performance_report import performance_info_append, error_upload, log_record, project_dict, performance_info
pd.set_option('display.expand_frame_repr', False)
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s', datefmt='%H:%M:%S @ %d/%m/%y', filename=level_2_optionals_cdsu_options.log_files['full_log'], filemode='a')
logging.Logger('errors')
# logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) # Allows the stdout to be seen in the console
logging.getLogger().addHandler(logging.StreamHandler(sys.stderr)) # Allows the stderr to be seen in the console
configuration_parameters = level_2_optionals_cdsu_options.selected_configuration_parameters
dict_sql_upload_flag = 0
def main():
log_record('Projeto: Sugestão Encomenda CDSU - Viaturas', project_id)
query_filters = {'NLR_CODE': '4R0', 'Franchise_Code_DW': '43'}
df = data_acquisition(query_filters)
control_prints(df, 'after getting data', head=1, date=1)
df = data_processing(df)
model_choice_message, df, vehicle_count = data_grouping_by_locals_temp(df, configuration_parameters, level_2_optionals_cdsu_options.project_id)
control_prints(df, 'before deployment', head=1)
deployment(df, level_2_optionals_cdsu_options.sql_info['database'], level_2_optionals_cdsu_options.sql_info['final_table'])
performance_info(level_2_optionals_cdsu_options.project_id, level_2_optionals_cdsu_options, model_choice_message, vehicle_count)
log_record('Conclusão com sucesso - Projeto {}.\n'.format(project_dict[project_id]), project_id)
def data_acquisition(query_filters):
performance_info_append(time.time(), 'Section_A_Start')
log_record('Início Secção A...', project_id)
df = sql_retrieve_df(level_2_optionals_cdsu_options.DSN_MLG_PRD, level_2_optionals_cdsu_options.sql_info['database'], level_2_optionals_cdsu_options.sql_info['initial_table'], level_2_optionals_cdsu_options, list(level_2_optionals_cdsu_options.sql_to_code_renaming.keys()), query_filters, column_renaming=1, parse_dates=['Purchase_Date', 'Sell_Date'])
# project_units_count_checkup(df, 'Nº Stock', level_2_optionals_cdsu_options, sql_check=0)
log_record('Fim Secção A.', project_id)
performance_info_append(time.time(), 'Section_A_End')
return df
def control_prints(df, tag, head=0, save=0, null_analysis_flag=0, date=0):
# print('{}\n{}'.format(tag, df.shape))
# try:
# print('Unique Vehicles: {}'.format(df['Nº Stock'].nunique()))
#
# except KeyError:
# print('Unique Vehicles: {}'.format(df['VHE_Number'].nunique()))
#
# if head:
# print(df.head())
# if save:
# df.to_csv('dbs/cdsu_control_save_tag_{}.csv'.format(tag))
# if null_analysis_flag:
# null_analysis(df)
# if date:
# try:
# print('Current Max Sell Date is {}'.format(max(df['Data Venda'])))
# except KeyError:
# print('Current Max Sell Date is {}'.format(max(df['Sell_Date'])))
return
def data_processing(df):
performance_info_append(time.time(), 'Section_B_Start')
log_record('Início Secção B...', project_id)
log_record('Checkpoint não encontrado ou demasiado antigo. A processar dados...', project_id)
df = lowercase_column_conversion(df, ['Opcional', 'Cor', 'Interior', 'Versão']) # Lowercases the strings of these columns
dict_strings_to_replace = {('Modelo', ' - não utilizar'): '', ('Interior', '\\|'): '/', ('Cor', '\\|'): '', ('Interior', 'ind.'): '', ('Interior', '\\]'): '/', ('Interior', '\\.'): ' ', ('Interior', '\'merino\''): 'merino', ('Interior', '\' merino\''): 'merino', ('Interior', '\'vernasca\''): 'vernasca', ('Interior', 'leder'): 'leather',
('Interior', 'p '): 'pele', ('Interior', 'pelenevada'): 'pele nevada', ('Opcional', 'bi-xénon'): 'bixénon', ('Opcional', 'bi-xenon'): 'bixénon', ('Opcional', 'vidro'): 'vidros', ('Opcional', 'dacota'): 'dakota', ('Opcional', 'whites'): 'white', ('Opcional', 'beige'): 'bege', ('Interior', '\'dakota\''): 'dakota', ('Interior', 'dacota'): 'dakota',
('Interior', 'mokka'): 'mocha', ('Interior', 'beige'): 'bege', ('Interior', 'dakota\''): 'dakota', ('Interior', 'antracite/cinza/p'): 'antracite/cinza/preto', ('Interior', 'antracite/cinza/pretoreto'): 'antracite/cinza/preto', ('Interior', 'nevada\''): 'nevada',
('Interior', '"nappa"'): 'nappa', ('Interior', 'anthrazit'): 'antracite', ('Interior', 'antracito'): 'antracite', ('Interior', 'preto/laranja/preto/lara'): 'preto/laranja', ('Interior', 'anthtacite'): 'antracite',
('Interior', 'champag'): 'champagne', ('Interior', 'cri'): 'crimson', ('Modelo', 'Enter Model Details'): '', ('Registration_Number', '\.'): '', ('Interior', 'preto/m '): 'preto ', ('Interior', 'congnac/preto'): 'cognac/preto',
('Local da Venda', 'DCN'): 'DCP', ('Cor', 'oceanao'): 'oceano', ('Cor', 'ocenao'): 'oceano', ('Interior', 'reto'): 'preto', ('Cor', 'banco'): 'branco', ('Cor', 'catanho'): 'castanho', ('Cor', 'petrìleo'): 'petróleo', ('Interior', 'ecido'): 'tecido',
('Interior', 'ege'): 'bege', ('Interior', 'inza'): 'cinza', ('Interior', 'inzento'): 'cinzento', ('Interior', 'teciso'): 'tecido', ('Opcional', 'autmático'): 'automático', ('Opcional', 'esctacionamento'): 'estacionamento',
('Opcional', 'estacionamernto'): 'estacionamento', ('Opcional', 'pct'): 'pacote', ('Opcional', 'navegaçãp'): 'navegação', ('Opcional', '\\+'): '', ('Versão', 'bussiness'): 'business', ('Versão', 'r-line'): 'rline', ('Versão', 'confortl'): 'confortline',
('Versão', 'high'): 'highline', ('Opcional', 'p/dsg'): 'para dsg', ('Opcional', 'dianteirostraseiros'): 'dianteiros traseiros', ('Opcional', 'dianteirostras'): 'dianteiros traseiros', ('Opcional', 'diant'): 'dianteiros',
('Opcional', 'dttras'): 'dianteiros traseiros', ('Opcional', 'dttrpark'): 'dianteiros traseiros park', ('Opcional', 'dianttras'): 'dianteiros traseiros', ('Opcional', 'câmara'): 'camara', ('Opcional', 'camera'): 'camara',
('Opcional', 'câmera'): 'camara', ('Versão', 'trendtline'): 'trendline', ('Versão', 'trendtline'): 'trendline', ('Versão', 'confort'): 'confortline', ('Versão', 'conftl'): 'confortline', ('Versão', 'hightline'): 'highline', ('Versão', 'bluem'): 'bluemotion',
('Versão', 'bmt'): 'bluemotion', ('Versão', 'up!bluemotion'): 'up! bluemotion', ('Versão', 'up!bluem'): 'up! bluemotion', ('Versão', 'trendl'): 'trendline', ('Versão', 'conft'): 'confortline', ('Versão', 'highlin'): 'highline',
('Versão', 'confortine'): 'confortline', ('Versão', 'cofrtl'): 'confortline', ('Versão', 'confortlline'): 'confortline', ('Versão', 'highl'): 'highline', ('Modelo', 'up!'): 'up'}
control_prints(df, '1', head=1)
df = string_replacer(df, dict_strings_to_replace) # Replaces the strings mentioned in dict_strings_to_replace which are typos, useless information, etc
control_prints(df, '1b', head=1)
df.dropna(subset=['Cor', 'Colour_Ext_Code', 'Modelo', 'Interior'], axis=0, inplace=True) # Removes all remaining NA's
control_prints(df, '2')
df = new_column_creation(df, [x for x in level_2_optionals_cdsu_options.configuration_parameters_full if x != 'Modelo' and x != 'Combustível'], 0) # Creates new columns filled with zeros, which will be filled in the future
df = total_price(df) # Creates a new column with the total cost for each configuration;
control_prints(df, '3a', head=0)
df = remove_zero_price_total_vhe(df, project_id) # Removes VHE with a price total of 0; ToDo: keep checking up if this is still necessary
control_prints(df, '3b', head=0)
df = remove_rows(df, [df[df.Franchise_Code.str.contains('X')].index], project_id) # This removes VW Commercials Vehicles that aren't supposed to be in this model
df = remove_rows(df, [df[(df.Colour_Ext_Code == ' ') & (df.Cor == ' ')].index], project_id, warning=1)
control_prints(df, '3c')
df = options_scraping_v2(df, level_2_optionals_cdsu_options, 'Modelo') # Scrapes the optionals columns for information regarding the GPS, Auto Transmission, Posterior Parking Sensors, External and Internal colours, Model and Rim's Size
control_prints(df, '3d', head=1, null_analysis_flag=1)
df.loc[df['Combustível'].isin(['Elétrico', 'Híbrido']), 'Motor'] = 'N/A' # Defaults the value of motorization for electric/hybrid cars;
control_prints(df, '4', head=0, save=1)
# df = remove_rows(df, [df[df.Modelo.isnull()].index], project_id, warning=1)
df = remove_columns(df, ['Colour_Ext_Code'], project_id) # This column was only needed for some very specific cases where no Colour_Ext_Code was available;
df.to_csv('dbs/df_cdsu.csv', index=False)
control_prints(df, '5', head=0, save=1)
# project_units_count_checkup(df, 'Nº Stock', level_2_optionals_cdsu_options, sql_check=1)
df = color_replacement(df, level_2_optionals_cdsu_options.colors_to_replace_dict, project_id) # Translates all english colors to portuguese
control_prints(df, '6', head=0, save=1)
df = duplicate_removal(df, subset_col='Nº Stock') # Removes duplicate rows, based on the Stock number. This leaves one line per configuration;
control_prints(df, '7')
df = remove_columns(df, ['Cor', 'Interior', 'Opcional', 'Custo', 'Versão', 'Franchise_Code'], project_id) # Remove columns not needed atm;
# Will probably need to also remove: stock_days, stock_days_norm, and one of the scores
# df = remove_rows(df, [df.loc[df['Local da Venda'] == 'DCV - Viat.Toy Viseu', :].index], project_id) # Removes the vehicles sold here, as they are from another brand (Toyota)
df = margin_calculation(df) # Calculates the margin in percentage of the total price
control_prints(df, '8')
df = score_calculation(df, [level_2_optionals_cdsu_options.stock_days_threshold], level_2_optionals_cdsu_options.margin_threshold, level_2_optionals_cdsu_options.project_id) # Classifies the stockdays and margin based in their respective thresholds in tow classes (0 or 1) and then creates a new_score metric,
control_prints(df, '9')
# where only configurations with 1 in both dimension, have 1 as new_score
# df = new_column_creation(df, ['Local da Venda_v2'], df['Local da Venda'])
# control_prints(df, '10')
# cols_to_group_layer_2 = ['Local da Venda']
# mapping_dictionaries, _ = sql_mapping_retrieval(level_2_optionals_cdsu_options.DSN_MLG_PRD, level_2_optionals_cdsu_options.sql_info['database'], level_2_optionals_cdsu_options.sql_info['mappings_temp'], 'Mapped_Value', level_2_optionals_cdsu_options)
# df = sell_place_parametrization(df, 'Local da Venda', 'Local da Venda_Fase2', mapping_dictionaries[2], level_2_optionals_cdsu_options.project_id)
# df = col_group(df, cols_to_group_layer_2[0:2], mapping_dictionaries[0:2], project_id) # Based on the information provided by Manuel some entries were grouped as to remove small groups. The columns grouped are mentioned in cols_to_group, and their respective groups are shown in level_2_optionals_cdsu_options
control_prints(df, '9b, before new features', null_analysis_flag=1)
df = new_features(df, configuration_parameters, project_id) # Creates a series of new features, explained in the provided pdf
control_prints(df, '10, after new_features', null_analysis_flag=1)
# global_variables_saving(df, level_2_optionals_cdsu_options.project_id) # Small functions to save 2 specific global variables which will be needed later
log_record('Checkpoint B.1...', project_id)
# performance_info_append(time.time(), 'checkpoint_b1')
df = column_rename(df, list(level_2_optionals_cdsu_options.column_checkpoint_sql_renaming.keys()), list(level_2_optionals_cdsu_options.column_checkpoint_sql_renaming.values()))
# sql_inject(df, level_2_optionals_cdsu_options.DSN_MLG_PRD, level_2_optionals_cdsu_options.sql_info['database'], level_2_optionals_cdsu_options.sql_info['checkpoint_b_table'], level_2_optionals_cdsu_options, list(level_2_optionals_cdsu_options.column_checkpoint_sql_renaming.values()), truncate=1, check_date=1)
df = column_rename(df, list(level_2_optionals_cdsu_options.column_checkpoint_sql_renaming.values()), list(level_2_optionals_cdsu_options.column_checkpoint_sql_renaming.keys()))
df = remove_columns(df, ['Date'], project_id)
log_record('Fim Secção B.', project_id)
performance_info_append(time.time(), 'Section_B_End')
return df
def deployment(df, db, view):
performance_info_append(time.time(), 'Section_E_Start')
log_record('Início Secção E...', project_id)
if df is not None:
df['NLR_Code'] = level_2_optionals_cdsu_options.nlr_code
# df = column_rename(df, list(level_2_optionals_cdsu_options.column_sql_renaming.keys()), list(level_2_optionals_cdsu_options.column_sql_renaming.values()))
df = df.rename(columns=level_2_optionals_cdsu_options.column_sql_renaming)
control_prints(df, 'before deployment, after renaming', head=1)
sql_delete(level_2_optionals_cdsu_options.DSN_MLG_PRD, db, view, level_2_optionals_cdsu_options, {'NLR_Code': '{}'.format(level_2_optionals_cdsu_options.nlr_code)})
sql_inject(df, level_2_optionals_cdsu_options.DSN_MLG_PRD, db, view, level_2_optionals_cdsu_options, list(level_2_optionals_cdsu_options.column_checkpoint_sql_renaming.values()), check_date=1)
log_record('Fim Secção E.', project_id)
performance_info_append(time.time(), 'Section_E_End')
return
if __name__ == '__main__':
try:
main()
except Exception as exception:
project_identifier, exception_desc = level_2_optionals_cdsu_options.project_id, str(sys.exc_info()[1])
log_record(exception_desc, project_identifier, flag=2)
error_upload(level_2_optionals_cdsu_options, project_identifier, format_exc(), exception_desc, error_flag=1)
log_record('Falhou - Projeto: {}.'.format(str(project_dict[project_identifier])), project_identifier)