Skip to content

Commit

Permalink
Merge pull request #329 from Marcellocosti/MLfix
Browse files: browse the repository at this point in the history
Conversion to run3 for ML scripts
  • Loading branch information
stefanopolitano authored Dec 17, 2024
2 parents 0fd31ff + 9cd6920 commit 25e8973
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 13 deletions.
30 changes: 25 additions & 5 deletions ML/MLApplication.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import sys
import argparse
import yaml
import matplotlib.pyplot as plt

from hipe4ml.model_handler import ModelHandler
from hipe4ml.tree_handler import TreeHandler
Expand All @@ -22,6 +23,13 @@ def main(): #pylint: disable=too-many-statements, too-many-branches
inputCfg = yaml.load(ymlCfgFile, yaml.FullLoader)
print('Loading analysis configuration: Done!')

if inputCfg.get('savecfg'):
# Save the YAML file to the folder
if not os.path.isdir(os.path.expanduser(inputCfg['standalone_appl']['output_dir'])):
os.makedirs(os.path.expanduser(inputCfg['standalone_appl']['output_dir']))
with open(f'{os.path.expanduser(inputCfg["standalone_appl"]["output_dir"])}/cfg.yml', 'w') as ymlOutFile:
yaml.dump(inputCfg, ymlOutFile, default_flow_style=False)

PtBins = [[a, b] for a, b in zip(inputCfg['pt_ranges']['min'], inputCfg['pt_ranges']['max'])]
OutputLabels = [inputCfg['output']['out_labels']['Bkg'],
inputCfg['output']['out_labels']['Prompt']]
Expand Down Expand Up @@ -49,7 +57,7 @@ def main(): #pylint: disable=too-many-statements, too-many-branches
else:
DataHandler = TreeHandler(inputFile, treename)

DataHandler.slice_data_frame('pt_cand', PtBins, True)
DataHandler.slice_data_frame('fPt', PtBins, True)
print(f'Loading and preparing data files {inputFile}: Done!')

print('Applying ML model to dataframes: ...', end='\r')
Expand All @@ -67,10 +75,10 @@ def main(): #pylint: disable=too-many-statements, too-many-branches
if not isinstance(ColumnsToSaveFinal, list):
print('\033[91mERROR: column_to_save_list must be defined!\033[0m')
sys.exit()
if 'inv_mass' not in ColumnsToSaveFinal:
print('\033[93mWARNING: inv_mass is not going to be saved in the output dataframe!\033[0m')
if 'pt_cand' not in ColumnsToSaveFinal:
print('\033[93mWARNING: pt_cand is not going to be saved in the output dataframe!\033[0m')
if 'fM' not in ColumnsToSaveFinal:
print('\033[93mWARNING: fM is not going to be saved in the output dataframe!\033[0m')
if 'fPt' not in ColumnsToSaveFinal:
print('\033[93mWARNING: fPt is not going to be saved in the output dataframe!\033[0m')
if 'pt_B' in ColumnsToSaveFinal and 'pt_B' not in DataDfPtSel.columns:
ColumnsToSaveFinal.remove('pt_B') # only in MC
DataDfPtSel = DataDfPtSel.loc[:, ColumnsToSaveFinal]
Expand All @@ -80,6 +88,18 @@ def main(): #pylint: disable=too-many-statements, too-many-branches
for Pred, Lab in enumerate(OutputLabels):
DataDfPtSel[f'ML_output_{Lab}'] = yPred[:, Pred]
DataDfPtSel.to_parquet(f'{OutPutDirPt}/{outName}_pT_{PtBin[0]}_{PtBin[1]}_ModelApplied.parquet.gzip')

if inputCfg.get('savedistrs'):
plt.figure(figsize=(10, 6))
for col in DataDfPtSel.columns:
if 'ML_output' in col:
plt.hist(DataDfPtSel[col], bins=100, alpha=0.5, label=col, log=True)
plt.title(f'Distributions of ML Outputs for {outName}')
plt.xlabel('Score')
plt.ylabel('Frequency (log scale)')
plt.legend()
plt.savefig(f"{OutPutDirPt}/{outName}Distrs.pdf", format="pdf", bbox_inches="tight")

del DataDfPtSel
print('Applying ML model to dataframes: Done!')

Expand Down
20 changes: 12 additions & 8 deletions ML/MLClassification.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,8 @@ def data_prep(inputCfg, iBin, PtBin, OutPutDirPt, PromptDf, FDDf, BkgDf): #pylin
yTest = LabelsArray.copy()

TrainTestData = [TrainSet, yTrain, TestSet, yTest]
PromptDfSelForEff = pd.concat([PromptDf.iloc[nCandToKeep:], TestSet[pd.Series(yTest).array == 1]], sort=False)
if FDDf.empty:
FDDfSelForEff = pd.DataFrame()
else:
FDDfSelForEff = pd.concat([FDDf.iloc[nCandToKeep:], TestSet[pd.Series(yTest).array == 2]], sort=False)
PromptDfSelForEff = TestSet[yTest == 1]
FDDfSelForEff = pd.DataFrame() if FDDf.empty else TestSet[yTest == 2]
del TotDf

elif dataset_opt == 'max_signal':
Expand Down Expand Up @@ -184,7 +181,7 @@ def train_test(inputCfg, PtBin, OutPutDirPt, TrainTestData, iBin): #pylint: disa
ModelHandl.optimize_params_optuna(TrainTestData, OptunaOptConfig, metric,
n_trials=inputCfg['ml']['hyper_par_opt']['ntrials'],
direction=inputCfg['ml']['hyper_par_opt']['direction'],
save_study=f'pT_{PtBin[0]}_{PtBin[1]}')
save_study=f'{OutPutDirPt}/OptunaStudy_pT_{PtBin[0]}_{PtBin[1]}')
OutFileHypPars.close()
sys.stdout = sys.__stdout__
print('Performing hyper-parameters optimisation: Done!')
Expand Down Expand Up @@ -230,7 +227,7 @@ def train_test(inputCfg, PtBin, OutPutDirPt, TrainTestData, iBin): #pylint: disa
#_____________________________________________
model_converter = H4MLConverter(ModelHandl)
model_onnx = model_converter.convert_model_onnx(1, len(TrainCols))
model_converter.dump_model_onnx(f'{OutPutDirPt}/XGBoostModel_pT_{PtBin[0]}_{PtBin[1]}_onnx.onnx') # dump the model in ONNX format
model_converter.dump_model_onnx(f'{OutPutDirPt}/ModelHandler_pT_{PtBin[0]}_{PtBin[1]}.onnx') # dump the model in ONNX format
#_____________________________________________
plt.rcParams["figure.figsize"] = (10, 9)
ROCCurveTTFig = plot_utils.plot_roc_train_test(TrainTestData[3], yPredTest, TrainTestData[1], yPredTrain, None,
Expand Down Expand Up @@ -266,7 +263,7 @@ def appl(inputCfg, PtBin, OutPutDirPt, ModelHandl, DataDfPtSel, PromptDfPtSelFor
if not isinstance(df_column_to_save_list, list):
print('\033[91mERROR: df_column_to_save_list must be defined!\033[0m')
sys.exit()
if 'inv_mass' not in df_column_to_save_list:
if 'fM' not in df_column_to_save_list:
print('\033[93mWARNING: inv_mass is not going to be saved in the output dataframe!\033[0m')
if 'fPt' not in df_column_to_save_list:
print('\033[93mWARNING: fPt is not going to be saved in the output dataframe!\033[0m')
Expand Down Expand Up @@ -318,6 +315,13 @@ def main(): #pylint: disable=too-many-statements
inputCfg = yaml.load(ymlCfgFile, yaml.FullLoader)
print('Loading analysis configuration: Done!')

if inputCfg.get('savecfg'):
# Save the YAML file to the folder
if not os.path.isdir(os.path.expanduser(inputCfg['output']['dir'])):
os.makedirs(os.path.expanduser(inputCfg['output']['dir']))
with open(f'{os.path.expanduser(inputCfg["output"]["dir"])}/cfg.yml', 'w') as ymlOutFile:
yaml.dump(inputCfg, ymlOutFile, default_flow_style=False)

print('Loading and preparing data files: ...', end='\r')
PromptHandler = TreeHandler(inputCfg['input']['prompt'], inputCfg['input']['treename'])
FDHandler = None if inputCfg['input']['FD'] is None else TreeHandler(inputCfg['input']['FD'],
Expand Down

0 comments on commit 25e8973

Please sign in to comment.