From 8828be20f9e71fd1b2bbd7ffbaf3249816e981c3 Mon Sep 17 00:00:00 2001 From: Kelly Date: Mon, 7 Mar 2022 12:13:42 -0500 Subject: [PATCH] adds flake8 and fies linting errors --- code/.DS_Store | Bin 8196 -> 8196 bytes code/requirements.txt | 4 +- code/setup.py | 6 +- code/topcoder_cognitive_state/CONSTANTS.py | 16 +- code/topcoder_cognitive_state/load_data.py | 246 ++++++++++++++------ code/topcoder_cognitive_state/model.py | 223 ++++++++++++++---- code/topcoder_cognitive_state/opt_params.py | 67 +++--- code/topcoder_cognitive_state/processing.py | 216 ++++++++++++----- code/topcoder_cognitive_state/test.py | 89 +++++-- code/topcoder_cognitive_state/train.py | 186 ++++++++++----- code/topcoder_cognitive_state/utils.py | 0 11 files changed, 734 insertions(+), 319 deletions(-) delete mode 100644 code/topcoder_cognitive_state/utils.py diff --git a/code/.DS_Store b/code/.DS_Store index 764de7cec99531b6154b93adae0c5b8d483cfa41..3a12a32269c94bfee3a2c56ff20c94231758e941 100644 GIT binary patch delta 38 ucmZp1XmOa}&nUPtU^hRb;AS3y=S-Vbg+DNEe$1o7yqR6%2g}APR%QVF%M7&u delta 93 zcmZp1XmOa}&nUDpU^hRb&}JTi=S=FH3_%R842}#g48A~Gk0F;K#WN>AIVmSUiGhJZ pfPsPWGmzHU{6gqA^Y59NGW? diff --git a/code/requirements.txt b/code/requirements.txt index 5b51eed..bd3acd8 100644 --- a/code/requirements.txt +++ b/code/requirements.txt @@ -4,4 +4,6 @@ lightgbm==3.3.2 scikit-learn==1.0.2 tqdm==4.62.3 scipy==1.7.3 -optuna==2.10.0 \ No newline at end of file +optuna==2.10.0 +flake8==4.0.1 +black==22.1.0 \ No newline at end of file diff --git a/code/setup.py b/code/setup.py index 4476718..5ac8655 100644 --- a/code/setup.py +++ b/code/setup.py @@ -12,6 +12,8 @@ name="topcoder_cognitive_state", version="0.0.1", packages=find_packages(), - py_modules=[splitext(basename(path))[0] for path in glob("topcoder_cognitive_state/*.py")], + py_modules=[ + splitext(basename(path))[0] for path in glob("topcoder_cognitive_state/*.py") + ], install_requires=requirements, -) \ No newline at end of file +) diff --git a/code/topcoder_cognitive_state/CONSTANTS.py b/code/topcoder_cognitive_state/CONSTANTS.py index b580a3a..98e3a80 100644 --- a/code/topcoder_cognitive_state/CONSTANTS.py +++ b/code/topcoder_cognitive_state/CONSTANTS.py @@ -1,13 +1,11 @@ TARGET2LABEL = { - "low": 0, - "medium": 1, - "high": 2, - "baseline": 3, - "channelized": 4, - "surprise": 5 + "low": 0, + "medium": 1, + "high": 2, + "baseline": 3, + "channelized": 4, + "surprise": 5, } LABEL2TARGET = dict(zip(TARGET2LABEL.values(), TARGET2LABEL.keys())) - - METADATA_COLUMNS = ["test_suite"] -NAN_VALUES = [-9999.9] \ No newline at end of file +NAN_VALUES = [-9999.9] diff --git a/code/topcoder_cognitive_state/load_data.py b/code/topcoder_cognitive_state/load_data.py index 4add9bd..1c41aea 100644 --- a/code/topcoder_cognitive_state/load_data.py +++ b/code/topcoder_cognitive_state/load_data.py @@ -1,97 +1,171 @@ -import os -from multiprocessing import Pool +from typing import Tuple import time -import pandas as pd +import pandas as pd from tqdm import tqdm from topcoder_cognitive_state.CONSTANTS import METADATA_COLUMNS, NAN_VALUES -def _test_missing_features(df): +def _test_missing_features(df: pd.DataFrame) -> pd.DataFrame: + """ + This test contains three tests which are run manually: + + 1. Check if some columns are missing + 2. Check if some columns have None values + 3. 
Check if some columns have -9999.9 (missing) values + """ # cols = ['ViveEye_gazeOrigin_L_X', 'ViveEye_gazeOrigin_L_Y', 'ViveEye_gazeOrigin_L_Z'] # cols = ['Myo_EMG_0', 'Myo_EMG_1', 'Myo_EMG_2', 'Myo_EMG_3', 'Myo_EMG_4', 'Myo_EMG_5', 'Myo_EMG_6'] # cols = ['Polar_bpm', 'Polar_hrv', 'tlx_score'] cols = [ # features - 'tlx_score', 'E4_BVP', 'E4_GSR', 'LooxidLink_EEG_A3', 'LooxidLink_EEG_A4', - 'LooxidLink_EEG_FP1', 'LooxidLink_EEG_FP2', 'LooxidLink_EEG_A7', 'LooxidLink_EEG_A8', - - 'Muse_EEG_TP9', 'Muse_EEG_AF7', 'Muse_EEG_AF8', 'Muse_EEG_TP10', - 'Muse_PPG_0', 'Muse_PPG_1', 'Muse_PPG_2', - - 'Myo_GYR_X', 'Myo_GYR_Y', 'Myo_GYR_Z', - 'Myo_EMG_0', 'Myo_EMG_1', 'Myo_EMG_2', 'Myo_EMG_3', 'Myo_EMG_4', 'Myo_EMG_5', 'Myo_EMG_6', 'Myo_EMG_7', - - 'PICARD_fnirs_0', 'PICARD_fnirs_1', - - 'Polar_bpm', 'Polar_hrv', - - 'ViveEye_pupilPos_L_X', 'ViveEye_pupilPos_L_Y', - 'ViveEye_pupilPos_R_X', 'ViveEye_pupilPos_R_Y', - - 'ViveEye_gazeOrigin_L_X', 'ViveEye_gazeOrigin_L_Y', 'ViveEye_gazeOrigin_L_Z', - 'ViveEye_gazeOrigin_R_X', 'ViveEye_gazeOrigin_R_Y', 'ViveEye_gazeOrigin_R_Z', - 'ViveEye_gazeDirection_L_X', 'ViveEye_gazeDirection_L_Y', 'ViveEye_gazeDirection_L_Z', - 'ViveEye_gazeDirection_R_X', 'ViveEye_gazeDirection_R_Y', 'ViveEye_gazeDirection_R_Z', - - 'ViveEye_eyeOpenness_L', 'ViveEye_pupilDiameter_L', - 'ViveEye_eyeOpenness_R', 'ViveEye_pupilDiameter_R', - - 'Zephyr_HR', 'Zephyr_HRV', + "tlx_score", + "E4_BVP", + "E4_GSR", + "LooxidLink_EEG_A3", + "LooxidLink_EEG_A4", + "LooxidLink_EEG_FP1", + "LooxidLink_EEG_FP2", + "LooxidLink_EEG_A7", + "LooxidLink_EEG_A8", + "Muse_EEG_TP9", + "Muse_EEG_AF7", + "Muse_EEG_AF8", + "Muse_EEG_TP10", + "Muse_PPG_0", + "Muse_PPG_1", + "Muse_PPG_2", + "Myo_GYR_X", + "Myo_GYR_Y", + "Myo_GYR_Z", + "Myo_EMG_0", + "Myo_EMG_1", + "Myo_EMG_2", + "Myo_EMG_3", + "Myo_EMG_4", + "Myo_EMG_5", + "Myo_EMG_6", + "Myo_EMG_7", + "PICARD_fnirs_0", + "PICARD_fnirs_1", + "Polar_bpm", + "Polar_hrv", + "ViveEye_pupilPos_L_X", + "ViveEye_pupilPos_L_Y", + "ViveEye_pupilPos_R_X", + "ViveEye_pupilPos_R_Y", + "ViveEye_gazeOrigin_L_X", + "ViveEye_gazeOrigin_L_Y", + "ViveEye_gazeOrigin_L_Z", + "ViveEye_gazeOrigin_R_X", + "ViveEye_gazeOrigin_R_Y", + "ViveEye_gazeOrigin_R_Z", + "ViveEye_gazeDirection_L_X", + "ViveEye_gazeDirection_L_Y", + "ViveEye_gazeDirection_L_Z", + "ViveEye_gazeDirection_R_X", + "ViveEye_gazeDirection_R_Y", + "ViveEye_gazeDirection_R_Z", + "ViveEye_eyeOpenness_L", + "ViveEye_pupilDiameter_L", + "ViveEye_eyeOpenness_R", + "ViveEye_pupilDiameter_R", + "Zephyr_HR", + "Zephyr_HRV", ] # case 1 - no column - # df = df.drop(cols, axis=1) + # df = df.drop(cols, axis=1) # case 2 - None values - #for col in cols: + # for col in cols: # df[col] = None - # case 3 - missing values + # case 3 - missing values for col in cols: df[col] = -9999.9 return df -def read_and_prepare_data_chunk(df): +def read_and_prepare_data_chunk(df: pd.DataFrame) -> pd.DataFrame: + """ + Read raw data and prepare it for processing. + I.e., create columns if they are missing, + replace missing values with None, + etc. 
+ + Args: + df (pd.DataFrame): input raw data + + Returns: + pd.DataFrame: processed data + """ EXPECTED_COLUMNS = [ # features - 'tlx_score', 'E4_BVP', 'E4_GSR', 'LooxidLink_EEG_A3', 'LooxidLink_EEG_A4', - 'LooxidLink_EEG_FP1', 'LooxidLink_EEG_FP2', 'LooxidLink_EEG_A7', 'LooxidLink_EEG_A8', - - 'Muse_EEG_TP9', 'Muse_EEG_AF7', 'Muse_EEG_AF8', 'Muse_EEG_TP10', - 'Muse_PPG_0', 'Muse_PPG_1', 'Muse_PPG_2', - - 'Myo_GYR_X', 'Myo_GYR_Y', 'Myo_GYR_Z', - 'Myo_EMG_0', 'Myo_EMG_1', 'Myo_EMG_2', 'Myo_EMG_3', 'Myo_EMG_4', 'Myo_EMG_5', 'Myo_EMG_6', 'Myo_EMG_7', - - 'PICARD_fnirs_0', 'PICARD_fnirs_1', - - 'Polar_bpm', 'Polar_hrv', - - 'ViveEye_pupilPos_L_X', 'ViveEye_pupilPos_L_Y', - 'ViveEye_pupilPos_R_X', 'ViveEye_pupilPos_R_Y', - - 'ViveEye_gazeOrigin_L_X', 'ViveEye_gazeOrigin_L_Y', 'ViveEye_gazeOrigin_L_Z', - 'ViveEye_gazeOrigin_R_X', 'ViveEye_gazeOrigin_R_Y', 'ViveEye_gazeOrigin_R_Z', - 'ViveEye_gazeDirection_L_X', 'ViveEye_gazeDirection_L_Y', 'ViveEye_gazeDirection_L_Z', - 'ViveEye_gazeDirection_R_X', 'ViveEye_gazeDirection_R_Y', 'ViveEye_gazeDirection_R_Z', - - 'ViveEye_eyeOpenness_L', 'ViveEye_pupilDiameter_L', - 'ViveEye_eyeOpenness_R', 'ViveEye_pupilDiameter_R', - - 'Zephyr_HR', 'Zephyr_HRV', - + "tlx_score", + "E4_BVP", + "E4_GSR", + "LooxidLink_EEG_A3", + "LooxidLink_EEG_A4", + "LooxidLink_EEG_FP1", + "LooxidLink_EEG_FP2", + "LooxidLink_EEG_A7", + "LooxidLink_EEG_A8", + "Muse_EEG_TP9", + "Muse_EEG_AF7", + "Muse_EEG_AF8", + "Muse_EEG_TP10", + "Muse_PPG_0", + "Muse_PPG_1", + "Muse_PPG_2", + "Myo_GYR_X", + "Myo_GYR_Y", + "Myo_GYR_Z", + "Myo_EMG_0", + "Myo_EMG_1", + "Myo_EMG_2", + "Myo_EMG_3", + "Myo_EMG_4", + "Myo_EMG_5", + "Myo_EMG_6", + "Myo_EMG_7", + "PICARD_fnirs_0", + "PICARD_fnirs_1", + "Polar_bpm", + "Polar_hrv", + "ViveEye_pupilPos_L_X", + "ViveEye_pupilPos_L_Y", + "ViveEye_pupilPos_R_X", + "ViveEye_pupilPos_R_Y", + "ViveEye_gazeOrigin_L_X", + "ViveEye_gazeOrigin_L_Y", + "ViveEye_gazeOrigin_L_Z", + "ViveEye_gazeOrigin_R_X", + "ViveEye_gazeOrigin_R_Y", + "ViveEye_gazeOrigin_R_Z", + "ViveEye_gazeDirection_L_X", + "ViveEye_gazeDirection_L_Y", + "ViveEye_gazeDirection_L_Z", + "ViveEye_gazeDirection_R_X", + "ViveEye_gazeDirection_R_Y", + "ViveEye_gazeDirection_R_Z", + "ViveEye_eyeOpenness_L", + "ViveEye_pupilDiameter_L", + "ViveEye_eyeOpenness_R", + "ViveEye_pupilDiameter_R", + "Zephyr_HR", + "Zephyr_HRV", # target - "induced_state" + "induced_state", ] + # uncomment to enable test # df = _test_missing_features(df) # test_suite - if 'test_suite' not in df.columns: - df['test_suite'] = "test" + if "test_suite" not in df.columns: + df["test_suite"] = "test" df["time"] = pd.to_datetime(df["time"], unit="us") df["timestamp"] = df["time"].dt.round("1s") @@ -118,30 +192,48 @@ def read_and_prepare_data_chunk(df): return ags -def get_dummy_template(df): +def get_dummy_template(df: pd.DataFrame) -> pd.DataFrame: + """ + The template is needed to match the expected sample submission format. 
+ """ df["time"] = pd.to_datetime(df["time"], unit="us") df["timestamp"] = df["time"].dt.round("1s") df = df.drop("time", axis=1) dummy_template = df.drop_duplicates( - subset=METADATA_COLUMNS + ["timestamp"], - keep="first" + subset=METADATA_COLUMNS + ["timestamp"], keep="first" ).reset_index(drop=True) dummy_template = dummy_template[METADATA_COLUMNS + ["timestamp"]] return dummy_template -def get_needed_data(df): +def get_needed_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]: + """ + Read data for training/testing and prepare template format for submission + + Return: + res1 - pd.DataFrame - read data + res2 - pd.DataFrame - template for submission + """ res1 = read_and_prepare_data_chunk(df) res2 = get_dummy_template(df) - return [res1, res2] + return res1, res2 def read_data( - path_to_data: str, - debug: bool = False -) -> pd.DataFrame: + path_to_data: str, debug: bool = False +) -> Tuple[pd.DataFrame, pd.DataFrame]: + """ + Read data. The data is read in chunks to reduce memory consumption. + + Args: + path_to_data (str): path to data + debug (bool, optional): run data loading on a sample of data. Defaults to False. + + Returns: + Tuple[pd.DataFrame, pd.DataFrame]: Read data and prepared template for submission + """ t_start = time.time() - chunksize = 10 ** 6 + chunksize = 10**6 if path_to_data is None: path_to_data = "./data/training-data.zip" @@ -152,11 +244,9 @@ def read_data( else: nrows = None + # create chunks iterator to read data chunks = pd.read_csv( - path_to_data, - na_values=NAN_VALUES, - chunksize=chunksize, - nrows=nrows + path_to_data, na_values=NAN_VALUES, chunksize=chunksize, nrows=nrows ) # get data for processing @@ -167,12 +257,14 @@ def read_data( res = [i[0] for i in full_result] res = pd.concat(res, axis=0) res = res.sort_index() - res = res[~res.index.duplicated(keep='first')] + res = res[~res.index.duplicated(keep="first")] # collect dummies for sub res2 = [i[1] for i in full_result] res2 = pd.concat(res2, axis=0) - res2 = res2.drop_duplicates(subset=METADATA_COLUMNS + ["timestamp"], keep="first").reset_index(drop=True) + res2 = res2.drop_duplicates( + subset=METADATA_COLUMNS + ["timestamp"], keep="first" + ).reset_index(drop=True) t_end = time.time() print(f"Data is read. 
Time per reading: {(t_end-t_start)/60:.2f} minutes") return res, res2 diff --git a/code/topcoder_cognitive_state/model.py b/code/topcoder_cognitive_state/model.py index cfe5bf6..fcd3527 100644 --- a/code/topcoder_cognitive_state/model.py +++ b/code/topcoder_cognitive_state/model.py @@ -1,25 +1,140 @@ from typing import List, Tuple -from functools import reduce - import pandas as pd import numpy as np -from scipy import stats +from lightgbm import LGBMClassifier + +from topcoder_cognitive_state.processing import FeaturesGenerator + + +ALLOWED_IMPORTANT_FEATURES = set( + [ + "E4_BVP", + "E4_GSR", + "LooxidLink_EEG_A3", + "LooxidLink_EEG_A4", + "LooxidLink_EEG_FP1", + "LooxidLink_EEG_FP2", + "LooxidLink_EEG_A7", + "LooxidLink_EEG_A8", + "Muse_EEG_TP9", + "Muse_EEG_AF7", + "Muse_EEG_AF8", + "Muse_EEG_TP10", + "Muse_PPG_0", + "Muse_PPG_1", + "Muse_PPG_2", + "Myo_GYR_X", + "Myo_GYR_Y", + "Myo_GYR_Z", + "Myo_EMG_0", + "Myo_EMG_1", + "Myo_EMG_2", + "Myo_EMG_3", + "Myo_EMG_4", + "Myo_EMG_5", + "Myo_EMG_6", + "Myo_EMG_7", + "PICARD_fnirs_0", + "PICARD_fnirs_1", + "Polar_bpm", + "Polar_hrv", + "ViveEye_eyeOpenness_L", + "ViveEye_pupilDiameter_L", + "ViveEye_pupilPos_L_X", + "ViveEye_pupilPos_L_Y", + "ViveEye_gazeOrigin_L_X", + "ViveEye_gazeOrigin_L_Y", + "ViveEye_gazeOrigin_L_Z", + "ViveEye_gazeDirection_L_X", + "ViveEye_gazeDirection_L_Y", + "ViveEye_gazeDirection_L_Z", + "ViveEye_eyeOpenness_R", + "ViveEye_pupilDiameter_R", + "ViveEye_pupilPos_R_X", + "ViveEye_pupilPos_R_Y", + "ViveEye_gazeOrigin_R_X", + "ViveEye_gazeOrigin_R_Y", + "ViveEye_gazeOrigin_R_Z", + "ViveEye_gazeDirection_R_X", + "ViveEye_gazeDirection_R_Y", + "ViveEye_gazeDirection_R_Z", + "Zephyr_HR", + "Zephyr_HRV", + ] +) + + +class Model: + def __init__( + self, + features: List[str], + preprocessor: FeaturesGenerator, + models_1: List[LGBMClassifier], + models_3: List[LGBMClassifier], + ): + """ + Model + + Args: + features (List[str]): list of features which were used during training + preprocessor (FeaturesGenerator): preprocessor + models_1 (List[LGBMClassifier]): list model models for t predictions + models_3 (List[LGBMClassifier]): list model models for t+3 predictions + """ + self.features = features + self.preprocessor = preprocessor + self.models_1 = models_1 + self.models_3 = models_3 + self.num_classes = 6 + + def ensure_features(self, x: pd.DataFrame) -> pd.DataFrame: + """ + Transform dataframe into the same format which was used during training. + I.e., ensure that all features are present. If a feature is missing - replace it with 0. + Ensure order of features in the dataframe. 
+ + Args: + x (pd.DataFrame): input data + + Returns: + pd.DataFrame: processed data + """ + for col in self.features: + if col not in x.columns: + x[col] = 0 + x = x[self.features] + return x + + def predict( + self, x_raw: pd.DataFrame + ) -> Tuple[np.array, np.array, np.array, np.array, List[List[str]]]: + """ + Make predictions for t and t+3, find the most important features + Args: + x_raw (pd.DataFrame): raw input data -ALLOWED_IMPORTANT_FEATURES = set(['E4_BVP', 'E4_GSR', 'LooxidLink_EEG_A3', 'LooxidLink_EEG_A4', 'LooxidLink_EEG_FP1', 'LooxidLink_EEG_FP2', - 'LooxidLink_EEG_A7', 'LooxidLink_EEG_A8', 'Muse_EEG_TP9', 'Muse_EEG_AF7', 'Muse_EEG_AF8', 'Muse_EEG_TP10', - 'Muse_PPG_0', 'Muse_PPG_1', 'Muse_PPG_2', 'Myo_GYR_X', 'Myo_GYR_Y', 'Myo_GYR_Z', 'Myo_EMG_0', 'Myo_EMG_1', - 'Myo_EMG_2', 'Myo_EMG_3', 'Myo_EMG_4', 'Myo_EMG_5', 'Myo_EMG_6', 'Myo_EMG_7', 'PICARD_fnirs_0', 'PICARD_fnirs_1', - 'Polar_bpm', 'Polar_hrv', 'ViveEye_eyeOpenness_L', 'ViveEye_pupilDiameter_L', 'ViveEye_pupilPos_L_X', - 'ViveEye_pupilPos_L_Y', 'ViveEye_gazeOrigin_L_X', 'ViveEye_gazeOrigin_L_Y', 'ViveEye_gazeOrigin_L_Z', - 'ViveEye_gazeDirection_L_X', 'ViveEye_gazeDirection_L_Y', 'ViveEye_gazeDirection_L_Z', 'ViveEye_eyeOpenness_R', - 'ViveEye_pupilDiameter_R', 'ViveEye_pupilPos_R_X', 'ViveEye_pupilPos_R_Y', 'ViveEye_gazeOrigin_R_X', - 'ViveEye_gazeOrigin_R_Y', 'ViveEye_gazeOrigin_R_Z', 'ViveEye_gazeDirection_R_X', 'ViveEye_gazeDirection_R_Y', - 'ViveEye_gazeDirection_R_Z', 'Zephyr_HR', 'Zephyr_HRV']) + Returns: + Tuple[np.array, np.array, np.array, np.array, List[List[str]]]: predictions and most important features + """ + # generate features + x, _, _, group = self.preprocessor.generate_features(x_raw, get_targers=False) + + # make predictions + y_hat_1, most_important_features = make_predictions(x, group, self.models_1) + y_hat_3 = y_hat_1 + + # transform predictions into labels + y_hat_1_label = self.preprocessor.apply_label2target(np.argmax(y_hat_1, axis=1)) + y_hat_3_label = y_hat_1_label + return y_hat_1, y_hat_3, y_hat_1_label, y_hat_3_label, most_important_features -def sum_arrays(arrs): +def sum_arrays(arrs: List[np.array]) -> np.array: + """ + Calculate sum of list of arrays + """ x = arrs[0] if len(arrs) == 1: return x @@ -28,21 +143,49 @@ def sum_arrays(arrs): return x -def mean_arrays(arrs): +def mean_arrays(arrs) -> np.array: + """ + Calculate the mean of the list of arrays + """ return sum_arrays(arrs) / len(arrs) -def postprocess_preds(preds, group): +def postprocess_preds(preds: np.array, group: pd.Series) -> pd.DataFrame: + """ + Smooth predictions by running rolling mean within the group + + Args: + preds (np.array): array of predicted probs + group (pd.Series): group id + + Returns: + pd.DataFrame: smoothed predicted probs + """ df = pd.DataFrame(preds) - df['group'] = group - res = df.groupby('group').rolling(window=999_999, min_periods=1).mean().values + df["group"] = group + res = df.groupby("group").rolling(window=999_999, min_periods=1).mean().values return res -def make_predictions(data: pd.DataFrame, group: pd.Series, models) -> Tuple[np.array, List[List[str]]]: +def make_predictions( + data: pd.DataFrame, group: pd.Series, models: List[LGBMClassifier] +) -> Tuple[np.array, List[List[str]]]: + """ + Make predictions and calculate the most important features + + Args: + data (pd.DataFrame): input raw data + group (pd.Series): group id for each row in input data + models (List[Model]): list of models. Final predictions is + avg prediction of all models predictions. 
+ + Returns: + Tuple[np.array, List[List[str]]]: predictions and most important features + """ y_hat = [] shap_values = [] for model in models: + # make predictions and calculate SHAP importance y_hat_pred = model.predict_proba(data) contribs = model.predict_proba(data, pred_contrib=True) @@ -50,24 +193,30 @@ def make_predictions(data: pd.DataFrame, group: pd.Series, models) -> Tuple[np.a # drop shap sum column: (n_samples, (n_features + 1) * n_classes) -> n_samples, n_features * n_classes y_shap_pred = np.vstack(contribs) n_features = data.shape[1] - indexes = [i for i in range(y_shap_pred.shape[1]) if (i // n_features > 0) and (i % n_features == 1)] + indexes = [ + i + for i in range(y_shap_pred.shape[1]) + if (i // n_features > 0) and (i % n_features == 1) + ] y_shap_pred = y_shap_pred[:, indexes] + # smooth predictions and shap importance using group id y_hat_pred = postprocess_preds(y_hat_pred, group) y_shap_pred = postprocess_preds(y_shap_pred, group) y_hat.append(y_hat_pred) shap_values.append(y_shap_pred) - + + # calculate mean for predictions and SHAP importance y_hat = mean_arrays(y_hat) shap_values = mean_arrays(shap_values) - # select most important featuers using shap values + # select most important featuers from list of allowed features most_important_features = [] topn_featuers = 3 for i in range(shap_values.shape[0]): row = shap_values[i, :] - ind_argsorted = np.argsort(row)[::-1] # high -> low + ind_argsorted = np.argsort(row)[::-1] # high -> low tmp = [] for j in ind_argsorted: feature = list(data.columns)[j % n_features] @@ -77,31 +226,3 @@ def make_predictions(data: pd.DataFrame, group: pd.Series, models) -> Tuple[np.a tmp = tmp[:topn_featuers] most_important_features.append(tmp) return y_hat, most_important_features - - -class Model: - def __init__(self, features: List[str], preprocessor, models_1, models_3): - self.features = features - self.preprocessor = preprocessor - self.models_1 = models_1 - self.models_3 = models_3 - self.num_classes = 6 - - def ensure_features(self, x): - for col in self.features: - if col not in x.columns: - x[col] = 0 - x = x[self.features] - return x - - def predict(self, x_raw): - x, _, _, group = self.preprocessor.generate_featres(x_raw, get_targers=False) - - # get probs - y_hat_1, most_important_features = make_predictions(x, group, self.models_1) - y_hat_3 = y_hat_1 - - # get labels - y_hat_1_label = self.preprocessor.apply_label2target(np.argmax(y_hat_1, axis=1)) - y_hat_3_label = y_hat_1_label - return y_hat_1, y_hat_3, y_hat_1_label, y_hat_3_label, most_important_features \ No newline at end of file diff --git a/code/topcoder_cognitive_state/opt_params.py b/code/topcoder_cognitive_state/opt_params.py index ecb1e4c..019fccb 100644 --- a/code/topcoder_cognitive_state/opt_params.py +++ b/code/topcoder_cognitive_state/opt_params.py @@ -1,33 +1,26 @@ -from typing import List, Tuple import sys import warnings -warnings.filterwarnings('ignore') -import copy +import copy import logging - -import pickle -from functools import reduce - import optuna -import pandas as pd -from sklearn.metrics import roc_auc_score -import numpy as np -from tqdm import tqdm -from lightgbm import LGBMClassifier from topcoder_cognitive_state.load_data import read_data from topcoder_cognitive_state.processing import FeaturesGenerator -from sklearn.model_selection import StratifiedGroupKFold -from topcoder_cognitive_state.model import Model +from topcoder_cognitive_state.model import Model # noqa: F401 from topcoder_cognitive_state.train import train_models 
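The opt_params.py changes in this hunk wrap train_models in the standard Optuna study/objective loop. As a minimal, self-contained sketch of that pattern only — the toy search space, the stand-in scoring function, and the trial count are illustrative assumptions, not the project's code:

import optuna


def stand_in_score(params: dict) -> float:
    # Stand-in for the real evaluation; opt_params.py calls train_models(...) here.
    return -abs(params["num_leaves"] - 63) - abs(params["learning_rate"] - 0.1)


def objective(trial: optuna.Trial) -> float:
    # Sample candidate LightGBM hyperparameters (toy search space).
    params = {
        "num_leaves": trial.suggest_int("num_leaves", 7, 255),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
    }
    return stand_in_score(params)


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)
print(study.best_trial.params)
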
+warnings.filterwarnings("ignore") + def main(): + """ + Optimize model's hyperparams using optuna + """ if len(sys.argv) < 2 or len(sys.argv[1]) == 0: print("Training input file is missing.") return 1 - + if len(sys.argv) < 3 or len(sys.argv[2]) == 0: print("Path to log is missing") return 1 @@ -39,14 +32,14 @@ def main(): handlers=[logging.FileHandler(path_to_log, mode="w"), logging.StreamHandler()], ) - print('Training started.') - + print("Training started.") + input_file = sys.argv[1] - output_file = sys.argv[2] + _ = sys.argv[2] data, _ = read_data(input_file) processor = FeaturesGenerator() - X, Y1, Y3, META = processor.generate_featres_train(data) + X, Y1, Y3, META = processor.generate_features_train(data) default_params = { "num_leaves": 127, @@ -58,25 +51,28 @@ def main(): "colsample_bytree": 0.67, "reg_alpha": 1.0, "reg_lambda": 1.0, - 'random_state': 42 + "random_state": 42, } def objective(trial): params = copy.deepcopy(default_params) - params.update({ - "num_leaves": trial.suggest_int('num_leaves', 7, 255, 8), - "max_depth": trial.suggest_int('max_depth', 3, 14, 1), - - "min_child_weight": trial.suggest_loguniform('min_child_weight', 1e-18, 1), - "min_child_samples": trial.suggest_int('min_child_samples', 1, 100, 1, log=True), - "min_split_gain": trial.suggest_loguniform('min_split_gain', 1e-18, 1), - - "subsample": trial.suggest_float('subsample', 0.1, 1.0), - "colsample_bytree": trial.suggest_float('colsample_bytree', 0.1, 1.0), - - "reg_alpha": trial.suggest_float('reg_alpha', 0.0, 10), - "reg_lambda": trial.suggest_float('reg_lambda', 0.0, 10), - }) + params.update( + { + "num_leaves": trial.suggest_int("num_leaves", 7, 255, 8), + "max_depth": trial.suggest_int("max_depth", 3, 14, 1), + "min_child_weight": trial.suggest_loguniform( + "min_child_weight", 1e-18, 1 + ), + "min_child_samples": trial.suggest_int( + "min_child_samples", 1, 100, 1, log=True + ), + "min_split_gain": trial.suggest_loguniform("min_split_gain", 1e-18, 1), + "subsample": trial.suggest_float("subsample", 0.1, 1.0), + "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0), + "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 10), + "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 10), + } + ) _, _, test_score = train_models(X, Y1, Y3, META, params_to_train=[params]) logging.info(f"Next itter score - {test_score}") @@ -96,5 +92,6 @@ def objective(trial): for key, value in best_trial.params.items(): logging.info(" {}: {}".format(key, value)) + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/code/topcoder_cognitive_state/processing.py b/code/topcoder_cognitive_state/processing.py index 3c7f804..b6b7d35 100644 --- a/code/topcoder_cognitive_state/processing.py +++ b/code/topcoder_cognitive_state/processing.py @@ -1,95 +1,152 @@ +from typing import List, Tuple import re -import pandas as pd +import pandas as pd import numpy as np from tqdm import tqdm from topcoder_cognitive_state.CONSTANTS import TARGET2LABEL, LABEL2TARGET -def get_distance(x1, x2): +def get_distance(x1: List[pd.Series], x2: List[pd.Series]) -> pd.Series: + """ + Calculate l2 distance between points + + Args: + x1 (List[pd.Series]): list of point x1 coordinates, e.g. X1, Y1, Z1 + x2 (List[pd.Series]): list of point x2 coordinates, e.g. 
X2, Y2, Z2 + + Returns: + pd.Series: l2 distance + """ delta = 0 - for a1, a2 in zip(x1,x2): + for a1, a2 in zip(x1, x2): delta += (a1 - a2) ** 2 delta = delta ** (1 / len(x1)) return delta class FeaturesGenerator: - def __init__(self, target_column="induced_state"): + def __init__(self, target_column: str = "induced_state"): + """ + Generate features using raw data + + Args: + target_column (str, optional): target column. Defaults to "induced_state". + """ self.target_column = target_column - + self.target2label = TARGET2LABEL self.label2target = LABEL2TARGET - def rename_cols(self, x): - x = x.rename(columns = lambda x: re.sub('[^A-Za-z0-9_]+', '', x)) + def rename_cols(self, x: pd.DataFrame): + """ + Lightgbm doesn't work well with all features names, so we need to rename some of them.""" + x = x.rename(columns=lambda x: re.sub("[^A-Za-z0-9_]+", "", x)) return x - def get_targets(self, x, future: int = 1): + def get_targets(self, x: pd.DataFrame, future: int = 1) -> pd.Series: + """ + Create training targets + + Args: + x (pd.DataFrame): input data + future (int, optional): steps to look into future. + Future=1 equals the current moment (t). + Future=2 equals to the next moment (t+1). + Defaults to 1. + + Returns: + pd.Series: target + """ if future == 0: y = x[self.target_column] else: - y = x.groupby("session_id")[self.target_column].shift(-1 * future).fillna(method='ffill') + y = ( + x.groupby("session_id")[self.target_column] + .shift(-1 * future) + .fillna(method="ffill") # replace missing targets with previous value + ) ind = y.isnull() y[ind] = x.loc[ind, self.target_column] y_label = self.apply_target2label(y) return y_label - - def apply_target2label(self, y): + + def apply_target2label(self, y: pd.Series) -> pd.Series: + # 'low' -> 0, 'medium' -> 1, etc return pd.Series(y).map(self.target2label) - - def apply_label2target(self, y): + + def apply_label2target(self, y: pd.Series) -> pd.Series: + # -> 'low', 1 -> 'medium', etc return pd.Series(y).map(self.label2target) - def calc_eyes_distances(self, x): + def calc_eyes_distances(self, x: pd.DataFrame) -> pd.DataFrame: + """ + Calculate features based on raw features from the eyes tracker. 
+ + Args: + x (pd.DataFrame): input data + + Returns: + pd.DataFrame: new dataframe with new features + """ new_features = pd.DataFrame({}) shifts = [1, 3] eyes = ["L", "R"] # L / R eyes new_features["ViveEye_pupilPos_LR_distance"] = get_distance( - (x["ViveEye_pupilPos_L_X"], x["ViveEye_pupilPos_L_Y"]), - (x["ViveEye_pupilPos_R_X"], x["ViveEye_pupilPos_R_Y"]) - ) + (x["ViveEye_pupilPos_L_X"], x["ViveEye_pupilPos_L_Y"]), + (x["ViveEye_pupilPos_R_X"], x["ViveEye_pupilPos_R_Y"]), + ) # distances for s in shifts: for pos in eyes: new_features[f"ViveEye_pupilPos_distance_{s}_{pos}"] = get_distance( + (x[f"ViveEye_pupilPos_{pos}_X"], x[f"ViveEye_pupilPos_{pos}_Y"]), ( - x[f"ViveEye_pupilPos_{pos}_X"], - x[f"ViveEye_pupilPos_{pos}_Y"] - ), - ( - x.groupby("session_id")[f"ViveEye_pupilPos_{pos}_X"].shift(s).values, - x.groupby("session_id")[f"ViveEye_pupilPos_{pos}_Y"].shift(s).values - ) + x.groupby("session_id")[f"ViveEye_pupilPos_{pos}_X"] + .shift(s) + .values, + x.groupby("session_id")[f"ViveEye_pupilPos_{pos}_Y"] + .shift(s) + .values, + ), ) - + # L / R eyes for feature in ["ViveEye_gazeOrigin", "ViveEye_gazeDirection"]: new_features["{feature}_LR_distance"] = get_distance( - (x[f"{feature}_L_X"], x[f"{feature}_L_Y"], x[f"{feature}_L_Z"]), - (x[f"{feature}_R_X"], x[f"{feature}_R_Y"], x[f"{feature}_R_Z"]) + (x[f"{feature}_L_X"], x[f"{feature}_L_Y"], x[f"{feature}_L_Z"]), + (x[f"{feature}_R_X"], x[f"{feature}_R_Y"], x[f"{feature}_R_Z"]), ) for s in shifts: for pos in eyes: new_features[f"distance_{feature}_{s}_{pos}"] = get_distance( ( - x[f"{feature}_{pos}_X"], - x[f"{feature}_{pos}_Y"], - x[f"{feature}_{pos}_Z"] - ), + x[f"{feature}_{pos}_X"], + x[f"{feature}_{pos}_Y"], + x[f"{feature}_{pos}_Z"], + ), ( - x.groupby("session_id")[f"{feature}_{pos}_X"].shift(s).values, - x.groupby("session_id")[f"{feature}_{pos}_Y"].shift(s).values, - x.groupby("session_id")[f"{feature}_{pos}_Z"].shift(s).values - ), + x.groupby("session_id")[f"{feature}_{pos}_X"] + .shift(s) + .values, + x.groupby("session_id")[f"{feature}_{pos}_Y"] + .shift(s) + .values, + x.groupby("session_id")[f"{feature}_{pos}_Z"] + .shift(s) + .values, + ), ) return new_features - def get_session_id_and_time_since_break(self, data): + def get_session_id_and_time_since_break(self, data: pd.DataFrame) -> pd.DataFrame: + """ + Calculate session id and time since the last break. + """ tmp = data.copy().reset_index() diff = tmp["timestamp"].diff().dt.total_seconds() tmp["session_id"] = np.cumsum(diff > 1) @@ -97,11 +154,23 @@ def get_session_id_and_time_since_break(self, data): data["session_id"] = tmp["session_id"].values return data - def generate_featres(self, x, get_targers: bool = False): + def generate_features( + self, x: pd.DataFrame, get_targers: bool = False + ) -> Tuple[pd.DataFrame, pd.Series, pd.Series, pd.Series]: + """ + Process raw data and create new features for training + + Args: + x (pd.DataFrame): raw input data + get_targers (bool, optional): generate targets? Needed for training. Defaults to False. 
+ + Returns: + Tuple[pd.DataFrame, pd.Series, pd.Series, pd.Series]: output + """ x = self.get_session_id_and_time_since_break(x) x = x.reset_index(drop=True) - # 1 and 3 seconds + # create targets for t and t+3 models if get_targers: y1 = self.get_targets(x, future=0) y3 = self.get_targets(x, future=3) @@ -113,42 +182,63 @@ def generate_featres(self, x, get_targers: bool = False): # additional featurers for c1, c2 in [ - ('Zephyr_HR', 'Zephyr_HRV'), - ('Polar_bpm', 'Polar_hrv'), - ('Zephyr_HRV', 'Polar_hrv'), + ("Zephyr_HR", "Zephyr_HRV"), + ("Polar_bpm", "Polar_hrv"), + ("Zephyr_HRV", "Polar_hrv"), ]: x[f"{c1}_div_{c2}"] = x[c1] / x[c2] dfs = [x] - # rolling stats + # create rolling stats features windows = [5, 999_999] cols = list(x.columns) for w in windows: - rolling_mean = x.groupby("session_id")[cols].rolling(min_periods=1, window=w).mean().reset_index(drop=True) - rolling_std = x.groupby("session_id")[cols].rolling(min_periods=1, window=w).std().reset_index(drop=True) + rolling_mean = ( + x.groupby("session_id")[cols] + .rolling(min_periods=1, window=w) + .mean() + .reset_index(drop=True) + ) + rolling_std = ( + x.groupby("session_id")[cols] + .rolling(min_periods=1, window=w) + .std() + .reset_index(drop=True) + ) normed = (x - rolling_mean) / (rolling_std + 1) - + normed = normed.add_prefix(f"normed_by_session_{w}_") rolling_mean = rolling_mean.add_prefix(f"mean_by_session_{w}_") rolling_std = rolling_std.add_prefix(f"std_by_session_{w}_") - + dfs += [rolling_mean, rolling_std, normed] + # create global stats features windows = [999_999] - global_cols = cols # + global_cols = cols for w in windows: - rolling_mean = x[global_cols].rolling(min_periods=2, window=w).mean().reset_index(drop=True) - rolling_std = x[global_cols].rolling(min_periods=2, window=w).std().reset_index(drop=True) + rolling_mean = ( + x[global_cols] + .rolling(min_periods=2, window=w) + .mean() + .reset_index(drop=True) + ) + rolling_std = ( + x[global_cols] + .rolling(min_periods=2, window=w) + .std() + .reset_index(drop=True) + ) normed = (x - rolling_mean) / (rolling_std + 1) - + normed = normed.add_prefix(f"normed_global_{w}_") rolling_mean = rolling_mean.add_prefix(f"mean_global_{w}_") rolling_std = rolling_std.add_prefix(f"std_global_{w}_") - + dfs += [rolling_mean, rolling_std, normed] - # shift features + # shift features for s in [1, 3]: gr_s = x.groupby("session_id")[cols].shift(s).reset_index(drop=True) tmp = x - gr_s @@ -164,7 +254,15 @@ def generate_featres(self, x, get_targers: bool = False): df = self.rename_cols(df) return df, y1, y3, session_id - def generate_featres_train(self, data): + def generate_features_train( + self, data: pd.DataFrame + ) -> Tuple[pd.DataFrame, pd.Series, pd.Series, pd.Series]: + """ + Generate features and targets for training + + Args: + data (pd.DataFrame): raw input data + """ # last index is time indexes = [i[:-1] for i in data.index] indexes = list(set(indexes)) @@ -174,14 +272,14 @@ def generate_featres_train(self, data): for index in tqdm(indexes): index_data_to_select = index # use full index x = data.loc[index_data_to_select] - - r, y1, y3, session_id = self.generate_featres(x, get_targers=True) - + + r, y1, y3, session_id = self.generate_features(x, get_targers=True) + X.append(r) Y1.append(y1) Y3.append(y3) - - # group id = person + task + + # group id = person + task task = pd.Series([index] * r.shape[0]) _meta = task.astype("str") + "__" + session_id.astype("str") META.append(_meta) @@ -190,4 +288,4 @@ def generate_featres_train(self, data): Y1 = 
pd.concat(Y1, axis=0).reset_index(drop=True) Y3 = pd.concat(Y3, axis=0).reset_index(drop=True) META = pd.concat(META, axis=0).reset_index(drop=True) - return X, Y1, Y3, META \ No newline at end of file + return X, Y1, Y3, META diff --git a/code/topcoder_cognitive_state/test.py b/code/topcoder_cognitive_state/test.py index cadbf18..025bc1d 100755 --- a/code/topcoder_cognitive_state/test.py +++ b/code/topcoder_cognitive_state/test.py @@ -1,20 +1,23 @@ +from typing import List import sys import warnings -warnings.filterwarnings('ignore') import time - import pickle -import pandas as pd import numpy as np +import pandas as pd from tqdm import tqdm from topcoder_cognitive_state.load_data import read_data -from topcoder_cognitive_state.CONSTANTS import METADATA_COLUMNS from topcoder_cognitive_state.model import Model +warnings.filterwarnings("ignore") -def arrays_to_str_list(arr): + +def arrays_to_str_list(arr: np.array) -> List[str]: + """ + Transform predictions arrays into a list of strings to match the submission format. + """ result = [] for i in range(arr.shape[0]): tmp = list(arr[i, :]) @@ -23,7 +26,10 @@ def arrays_to_str_list(arr): return result -def lists_to_str_list(arr): +def lists_to_str_list(arr: List[List[str]]) -> List[str]: + """ + Transform most important features arrays into a list of strings to match submission format. + """ result = [] for tmp in arr: tmp = "[" + " ".join(["'" + str(s) + "'" for s in tmp]) + "]" @@ -31,14 +37,33 @@ def lists_to_str_list(arr): return result -def make_predictions_for_test_suite(data, test_suite, model): - y_hat_1, y_hat_3, y_hat_1_label, y_hat_3_label, most_important_features = model.predict(data) +def make_predictions_for_test_suite( + data: pd.DataFrame, test_suite: pd.Series, model: Model +) -> pd.DataFrame: + """ + Make predictions for single `test_suite` + + Args: + data (pd.DataFrame): test_suite input data + test_suite (pd.Series): test_suite value + model (Model): model + + Returns: + pd.DataFrame: predictions + """ + ( + y_hat_1, + y_hat_3, + y_hat_1_label, + y_hat_3_label, + most_important_features, + ) = model.predict(data) # combine results result = pd.DataFrame({}) - result['timestamp'] = data.reset_index()['timestamp'] - result['test_suite'] = test_suite + result["timestamp"] = data.reset_index()["timestamp"] + result["test_suite"] = test_suite result["predicted_induced_state"] = y_hat_1_label result["three_sec_predicted_induced_state"] = y_hat_3_label @@ -49,19 +74,32 @@ def make_predictions_for_test_suite(data, test_suite, model): result["top_three_features"] = lists_to_str_list(most_important_features) result_cols = [ - 'timestamp', - 'test_suite', - 'predicted_induced_state', - 'predicted_induced_state_confidence', - 'three_sec_predicted_induced_state', - 'three_sec_predicted_induced_state_confidence', - 'top_three_features' + "timestamp", + "test_suite", + "predicted_induced_state", + "predicted_induced_state_confidence", + "three_sec_predicted_induced_state", + "three_sec_predicted_induced_state_confidence", + "top_three_features", ] result = result[result_cols] return result -def make_predictions(data, dummies, model): +def make_predictions( + data: pd.DataFrame, dummies: pd.DataFrame, model: Model +) -> pd.DataFrame: + """ + Make predictions for raw input data + + Args: + data (pd.DataFrame): input data + dummies (pd.DataFrame): dummies dataframe to match sample submission format + model (Model): model to make predictions + + Returns: + pd.DataFrame: predictions + """ t_start = time.time() # get unique @@ -82,8 +120,10 @@ 
def make_predictions(data, dummies, model): result = pd.merge(dummies, result, how="left", on=["timestamp", "test_suite"]) # process ts - result['timestamp'] = pd.to_datetime(result['timestamp']).apply(lambda x: x.value) / 10**3 - result['timestamp'] = result['timestamp'].astype("int") + result["timestamp"] = ( + pd.to_datetime(result["timestamp"]).apply(lambda x: x.value) / 10**3 + ) + result["timestamp"] = result["timestamp"].astype("int") t_end = time.time() print(f"Predicions are made. Time: {(t_end-t_start)/60:.2f} minutes") @@ -94,18 +134,18 @@ def main(): if len(sys.argv) < 2 or len(sys.argv[1]) == 0: print("Testing input file is missing.") return 1 - + if len(sys.argv) < 3 or len(sys.argv[2]) == 0: print("Testing output file is missing.") return 1 - - print('Testing started.') + + print("Testing started.") input_file = sys.argv[1] output_file = sys.argv[2] model_file = sys.argv[3] - with open(model_file,'rb') as f: + with open(model_file, "rb") as f: model = pickle.load(f) # load data @@ -114,5 +154,6 @@ def main(): result.to_csv(output_file, index=False) return 0 + if __name__ == "__main__": main() diff --git a/code/topcoder_cognitive_state/train.py b/code/topcoder_cognitive_state/train.py index ee2e4bf..62315c5 100755 --- a/code/topcoder_cognitive_state/train.py +++ b/code/topcoder_cognitive_state/train.py @@ -1,24 +1,23 @@ -from pyexpat import features -from typing import List, Tuple +from typing import List, Optional, Dict, Tuple import sys import warnings -warnings.filterwarnings('ignore') - import pickle -from functools import reduce -import pandas as pd from sklearn.metrics import roc_auc_score import numpy as np from tqdm import tqdm from lightgbm import LGBMClassifier +import pandas as pd from topcoder_cognitive_state.load_data import read_data from topcoder_cognitive_state.processing import FeaturesGenerator from sklearn.model_selection import StratifiedGroupKFold from topcoder_cognitive_state.model import Model +warnings.filterwarnings("ignore") + +# lightgbm model hyperparams PARAMS = [ { "num_leaves": 151, @@ -32,43 +31,50 @@ "reg_lambda": 9.90, "min_child_weight": 0.005519, "min_split_gain": 1.94e-14, - 'random_state': 42 + "random_state": 42, }, - { - 'num_leaves': 79, - 'learning_rate': 0.12, - 'n_estimators': 600, - 'min_child_samples': 14, - 'subsample': 0.75, - 'subsample_freq': 5, - 'colsample_bytree': 0.75, - 'reg_alpha': 2.2, - 'reg_lambda': 1.5, - 'random_state': 424242, - 'min_child_weight': 6.681437316563333e-12, - 'min_split_gain': 0.00039529173804292325, + "num_leaves": 79, + "learning_rate": 0.12, + "n_estimators": 600, + "min_child_samples": 14, + "subsample": 0.75, + "subsample_freq": 5, + "colsample_bytree": 0.75, + "reg_alpha": 2.2, + "reg_lambda": 1.5, + "random_state": 424242, + "min_child_weight": 6.681437316563333e-12, + "min_split_gain": 0.00039529173804292325, }, - { - 'num_leaves': 23, - 'learning_rate': 0.12, - 'n_estimators': 500, - 'min_child_samples': 30, - 'subsample': 0.6, - 'subsample_freq': 5, - 'colsample_bytree': 0.4, - 'reg_alpha': 0, - 'reg_lambda': 0, - 'random_state': 4242, - 'min_child_weight': 0.28, - 'min_split_gain': 9.793058539831146e-08, - } - + "num_leaves": 23, + "learning_rate": 0.12, + "n_estimators": 500, + "min_child_samples": 30, + "subsample": 0.6, + "subsample_freq": 5, + "colsample_bytree": 0.4, + "reg_alpha": 0, + "reg_lambda": 0, + "random_state": 4242, + "min_child_weight": 0.28, + "min_split_gain": 9.793058539831146e-08, + }, ] -def get_auc(y_true, probas_pred): +def get_auc(y_true: np.array, probas_pred: 
np.array) -> Tuple[str, float, bool]: + """ + Calculate avg auc for multiclass classification + + Args: + y_true (np.array): array of labels + probas_pred (np.array): array of predicted probs + + Returns: + Tuple[str, float, bool]: name of metrics, the value of metric, higher = better? + """ aucs = [] preds = np.array(probas_pred) preds = preds.reshape(-1, 6) @@ -79,14 +85,39 @@ def get_auc(y_true, probas_pred): return "mean_auc", score, True -def drop_null_targets(X_raw, y_raw): +def drop_null_targets( + X_raw: pd.DataFrame, y_raw: pd.Series +) -> Tuple[pd.DataFrame, pd.Series]: + """ + Drop all rows where target is missing + + Args: + X_raw (pd.DataFrame): features + y_raw (pd.Series): targets + + Returns: + Tuple[pd.DataFrame, pd.Series]: features and targets without rows with missing data + """ X = X_raw.reset_index(drop=True) y = y_raw.reset_index(drop=True) ind = y.notnull() return X.loc[ind], y[ind] -def train_model(X, Y1, Y3): +def train_model( + X: pd.DataFrame, Y1: pd.Series, Y3: pd.Series +) -> Tuple[List[LGBMClassifier], List[LGBMClassifier]]: + """ + Train models for t and t+3 predictions + + Args: + X (pd.DataFrame): input features + Y1 (pd.Series): t targets + Y3 (pd.Series): t+3 targets + + Returns: + List[LGBMClassifier]: list of trained models + """ models_1 = [] models_3 = [] for params in PARAMS: @@ -101,19 +132,47 @@ def train_model(X, Y1, Y3): return models_1, models_3 -def train_models(X, Y1, Y3, META, params_to_train=None): +def train_models( + X: pd.Dataframe, + Y1: pd.Series, + Y3: pd.Series, + META: pd.Series, + params_to_train: Optional[List[Dict]] = None, +) -> Tuple[List[LGBMClassifier], List[LGBMClassifier], float]: + """ + Train models + + Args: + X (pd.DataFrame): input features + Y1 (pd.Series): t targets + Y3 (pd.Series): t+3 targets + META (pd.Series): metadata + params_to_train (Optional[List[Dict]], optional): list of dicts of hyperparams to use for training. Defaults to None. 
+ + Returns: + Tuple[List[LGBMClassifier], List[LGBMClassifier], float]: t and t+3 trained models, mean validation score + """ if params_to_train is None: params_to_train = PARAMS - folds = list(StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42).split(X, Y1, META)) - folds += list(StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=100).split(X, Y1, META)) + # use repeated statified group kfold for training + folds = list( + StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42).split( + X, Y1, META + ) + ) + folds += list( + StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=100).split( + X, Y1, META + ) + ) models_1, models_3 = [], [] for params in params_to_train: print("Training model with new params") print(params) - scores_1, scores_3 = [], [] + scores_1 = [] fin_scores = [] for fold_ind, (train_index, test_index) in tqdm(enumerate(folds)): # get split @@ -121,30 +180,31 @@ def train_models(X, Y1, Y3, META, params_to_train=None): y_1_train, y_1_test = Y1[train_index], Y1[test_index] y_3_train, y_3_test = Y3[train_index], Y3[test_index] # meta_train, meta_val = META[train_index], META[test_index] - + # drop null targets X_train_1, y_train_1 = drop_null_targets(X_train, y_1_train) - X_train_3, y_train_3 = drop_null_targets(X_train, y_3_train) + _, _ = drop_null_targets(X_train, y_3_train) X_val_1, y_val_1 = drop_null_targets(X_test, y_1_test) - X_val_3, y_val_3 = drop_null_targets(X_test, y_3_test) - + _, _ = drop_null_targets(X_test, y_3_test) + try: # train models model_1 = LGBMClassifier(**params) model_1.fit( - X_train_1, y_train_1, - eval_set=(X_val_1, y_val_1), - # eval_metric=get_auc, + X_train_1, + y_train_1, + eval_set=(X_val_1, y_val_1), verbose=100, - early_stopping_rounds=50 + early_stopping_rounds=50, ) - auc_1 = get_auc(y_true=y_val_1, probas_pred=model_1.predict_proba(X_val_1))[1] + auc_1 = get_auc( + y_true=y_val_1, probas_pred=model_1.predict_proba(X_val_1) + )[1] scores_1.append(auc_1) models_1.append(model_1) models_3 = models_1 auc_3 = auc_1 - scores_3 = scores_1 fin_score = 0.7 * auc_1 + 0.3 * auc_3 fin_scores.append(fin_score) @@ -159,35 +219,39 @@ def train_models(X, Y1, Y3, META, params_to_train=None): def main(): + """ + Run training + """ if len(sys.argv) < 2 or len(sys.argv[1]) == 0: print("Training input file is missing.") return 1 - + if len(sys.argv) < 3 or len(sys.argv[2]) == 0: print("Training output file is missing.") return 1 - print('Training started.') - + print("Training started.") + input_file = sys.argv[1] output_file = sys.argv[2] data, _ = read_data(input_file) processor = FeaturesGenerator() - X, Y1, Y3, META = processor.generate_featres_train(data) + X, Y1, Y3, _ = processor.generate_features_train(data) models_1, models_3 = train_model(X, Y1, Y3) main_model = Model( features=list(X.columns), - preprocessor=processor, - models_1=models_1, - models_3=models_3 + preprocessor=processor, + models_1=models_1, + models_3=models_3, ) - with open(output_file, 'wb') as f: + with open(output_file, "wb") as f: pickle.dump(main_model, f, pickle.HIGHEST_PROTOCOL) - print('Training finished.') + print("Training finished.") return 0 + if __name__ == "__main__": main() diff --git a/code/topcoder_cognitive_state/utils.py b/code/topcoder_cognitive_state/utils.py deleted file mode 100644 index e69de29..0000000
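
As a closing reference, a rough sketch of how the pickled artifact written by train.py is consumed, distilled from test.py above. The file paths are placeholders, and test.py additionally slices the prepared frame per test_suite before predicting:

import pickle

from topcoder_cognitive_state.load_data import read_data

# Load the Model object that train.py pickled (path is a placeholder).
with open("model.pkl", "rb") as f:
    model = pickle.load(f)

# read_data returns the prepared sensor frame and the submission template.
data, dummies = read_data("./data/testing-data.zip")

# Model.predict generates features internally and returns probabilities and
# labels for t and t+3, plus the top SHAP-ranked features for each row.
y_hat_1, y_hat_3, labels_1, labels_3, top_features = model.predict(data)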