From 8828be20f9e71fd1b2bbd7ffbaf3249816e981c3 Mon Sep 17 00:00:00 2001 From: Kelly Date: Mon, 7 Mar 2022 12:13:42 -0500 Subject: [PATCH] adds flake8 and fies linting errors --- code/.DS_Store | Bin 8196 -> 8196 bytes code/requirements.txt | 4 +- code/setup.py | 6 +- code/topcoder_cognitive_state/CONSTANTS.py | 16 +- code/topcoder_cognitive_state/load_data.py | 246 ++++++++++++++------ code/topcoder_cognitive_state/model.py | 223 ++++++++++++++---- code/topcoder_cognitive_state/opt_params.py | 67 +++--- code/topcoder_cognitive_state/processing.py | 216 ++++++++++++----- code/topcoder_cognitive_state/test.py | 89 +++++-- code/topcoder_cognitive_state/train.py | 186 ++++++++++----- code/topcoder_cognitive_state/utils.py | 0 11 files changed, 734 insertions(+), 319 deletions(-) delete mode 100644 code/topcoder_cognitive_state/utils.py diff --git a/code/.DS_Store b/code/.DS_Store index 764de7cec99531b6154b93adae0c5b8d483cfa41..3a12a32269c94bfee3a2c56ff20c94231758e941 100644 GIT binary patch delta 38 ucmZp1XmOa}&nUPtU^hRb;AS3y=S-Vbg+DNEe$1o7yqR6%2g}APR%QVF%M7&u delta 93 zcmZp1XmOa}&nUDpU^hRb&}JTi=S=FH3_%R842}#g48A~Gk0F;K#WN>AIVmSUiGhJZ pfPsPWGmzHU{6gqA^Y59NGW? diff --git a/code/requirements.txt b/code/requirements.txt index 5b51eed..bd3acd8 100644 --- a/code/requirements.txt +++ b/code/requirements.txt @@ -4,4 +4,6 @@ lightgbm==3.3.2 scikit-learn==1.0.2 tqdm==4.62.3 scipy==1.7.3 -optuna==2.10.0 \ No newline at end of file +optuna==2.10.0 +flake8==4.0.1 +black==22.1.0 \ No newline at end of file diff --git a/code/setup.py b/code/setup.py index 4476718..5ac8655 100644 --- a/code/setup.py +++ b/code/setup.py @@ -12,6 +12,8 @@ name="topcoder_cognitive_state", version="0.0.1", packages=find_packages(), - py_modules=[splitext(basename(path))[0] for path in glob("topcoder_cognitive_state/*.py")], + py_modules=[ + splitext(basename(path))[0] for path in glob("topcoder_cognitive_state/*.py") + ], install_requires=requirements, -) \ No newline at end of file +) diff --git a/code/topcoder_cognitive_state/CONSTANTS.py b/code/topcoder_cognitive_state/CONSTANTS.py index b580a3a..98e3a80 100644 --- a/code/topcoder_cognitive_state/CONSTANTS.py +++ b/code/topcoder_cognitive_state/CONSTANTS.py @@ -1,13 +1,11 @@ TARGET2LABEL = { - "low": 0, - "medium": 1, - "high": 2, - "baseline": 3, - "channelized": 4, - "surprise": 5 + "low": 0, + "medium": 1, + "high": 2, + "baseline": 3, + "channelized": 4, + "surprise": 5, } LABEL2TARGET = dict(zip(TARGET2LABEL.values(), TARGET2LABEL.keys())) - - METADATA_COLUMNS = ["test_suite"] -NAN_VALUES = [-9999.9] \ No newline at end of file +NAN_VALUES = [-9999.9] diff --git a/code/topcoder_cognitive_state/load_data.py b/code/topcoder_cognitive_state/load_data.py index 4add9bd..1c41aea 100644 --- a/code/topcoder_cognitive_state/load_data.py +++ b/code/topcoder_cognitive_state/load_data.py @@ -1,97 +1,171 @@ -import os -from multiprocessing import Pool +from typing import Tuple import time -import pandas as pd +import pandas as pd from tqdm import tqdm from topcoder_cognitive_state.CONSTANTS import METADATA_COLUMNS, NAN_VALUES -def _test_missing_features(df): +def _test_missing_features(df: pd.DataFrame) -> pd.DataFrame: + """ + This test contains three tests which are run manually: + + 1. Check if some columns are missing + 2. Check if some columns have None values + 3. 
Check if some columns have -9999.9 (missing) values + """ # cols = ['ViveEye_gazeOrigin_L_X', 'ViveEye_gazeOrigin_L_Y', 'ViveEye_gazeOrigin_L_Z'] # cols = ['Myo_EMG_0', 'Myo_EMG_1', 'Myo_EMG_2', 'Myo_EMG_3', 'Myo_EMG_4', 'Myo_EMG_5', 'Myo_EMG_6'] # cols = ['Polar_bpm', 'Polar_hrv', 'tlx_score'] cols = [ # features - 'tlx_score', 'E4_BVP', 'E4_GSR', 'LooxidLink_EEG_A3', 'LooxidLink_EEG_A4', - 'LooxidLink_EEG_FP1', 'LooxidLink_EEG_FP2', 'LooxidLink_EEG_A7', 'LooxidLink_EEG_A8', - - 'Muse_EEG_TP9', 'Muse_EEG_AF7', 'Muse_EEG_AF8', 'Muse_EEG_TP10', - 'Muse_PPG_0', 'Muse_PPG_1', 'Muse_PPG_2', - - 'Myo_GYR_X', 'Myo_GYR_Y', 'Myo_GYR_Z', - 'Myo_EMG_0', 'Myo_EMG_1', 'Myo_EMG_2', 'Myo_EMG_3', 'Myo_EMG_4', 'Myo_EMG_5', 'Myo_EMG_6', 'Myo_EMG_7', - - 'PICARD_fnirs_0', 'PICARD_fnirs_1', - - 'Polar_bpm', 'Polar_hrv', - - 'ViveEye_pupilPos_L_X', 'ViveEye_pupilPos_L_Y', - 'ViveEye_pupilPos_R_X', 'ViveEye_pupilPos_R_Y', - - 'ViveEye_gazeOrigin_L_X', 'ViveEye_gazeOrigin_L_Y', 'ViveEye_gazeOrigin_L_Z', - 'ViveEye_gazeOrigin_R_X', 'ViveEye_gazeOrigin_R_Y', 'ViveEye_gazeOrigin_R_Z', - 'ViveEye_gazeDirection_L_X', 'ViveEye_gazeDirection_L_Y', 'ViveEye_gazeDirection_L_Z', - 'ViveEye_gazeDirection_R_X', 'ViveEye_gazeDirection_R_Y', 'ViveEye_gazeDirection_R_Z', - - 'ViveEye_eyeOpenness_L', 'ViveEye_pupilDiameter_L', - 'ViveEye_eyeOpenness_R', 'ViveEye_pupilDiameter_R', - - 'Zephyr_HR', 'Zephyr_HRV', + "tlx_score", + "E4_BVP", + "E4_GSR", + "LooxidLink_EEG_A3", + "LooxidLink_EEG_A4", + "LooxidLink_EEG_FP1", + "LooxidLink_EEG_FP2", + "LooxidLink_EEG_A7", + "LooxidLink_EEG_A8", + "Muse_EEG_TP9", + "Muse_EEG_AF7", + "Muse_EEG_AF8", + "Muse_EEG_TP10", + "Muse_PPG_0", + "Muse_PPG_1", + "Muse_PPG_2", + "Myo_GYR_X", + "Myo_GYR_Y", + "Myo_GYR_Z", + "Myo_EMG_0", + "Myo_EMG_1", + "Myo_EMG_2", + "Myo_EMG_3", + "Myo_EMG_4", + "Myo_EMG_5", + "Myo_EMG_6", + "Myo_EMG_7", + "PICARD_fnirs_0", + "PICARD_fnirs_1", + "Polar_bpm", + "Polar_hrv", + "ViveEye_pupilPos_L_X", + "ViveEye_pupilPos_L_Y", + "ViveEye_pupilPos_R_X", + "ViveEye_pupilPos_R_Y", + "ViveEye_gazeOrigin_L_X", + "ViveEye_gazeOrigin_L_Y", + "ViveEye_gazeOrigin_L_Z", + "ViveEye_gazeOrigin_R_X", + "ViveEye_gazeOrigin_R_Y", + "ViveEye_gazeOrigin_R_Z", + "ViveEye_gazeDirection_L_X", + "ViveEye_gazeDirection_L_Y", + "ViveEye_gazeDirection_L_Z", + "ViveEye_gazeDirection_R_X", + "ViveEye_gazeDirection_R_Y", + "ViveEye_gazeDirection_R_Z", + "ViveEye_eyeOpenness_L", + "ViveEye_pupilDiameter_L", + "ViveEye_eyeOpenness_R", + "ViveEye_pupilDiameter_R", + "Zephyr_HR", + "Zephyr_HRV", ] # case 1 - no column - # df = df.drop(cols, axis=1) + # df = df.drop(cols, axis=1) # case 2 - None values - #for col in cols: + # for col in cols: # df[col] = None - # case 3 - missing values + # case 3 - missing values for col in cols: df[col] = -9999.9 return df -def read_and_prepare_data_chunk(df): +def read_and_prepare_data_chunk(df: pd.DataFrame) -> pd.DataFrame: + """ + Read raw data and prepare it for processing. + I.e., create columns if they are missing, + replace missing values with None, + etc. 
+ + Args: + df (pd.DataFrame): input raw data + + Returns: + pd.DataFrame: processed data + """ EXPECTED_COLUMNS = [ # features - 'tlx_score', 'E4_BVP', 'E4_GSR', 'LooxidLink_EEG_A3', 'LooxidLink_EEG_A4', - 'LooxidLink_EEG_FP1', 'LooxidLink_EEG_FP2', 'LooxidLink_EEG_A7', 'LooxidLink_EEG_A8', - - 'Muse_EEG_TP9', 'Muse_EEG_AF7', 'Muse_EEG_AF8', 'Muse_EEG_TP10', - 'Muse_PPG_0', 'Muse_PPG_1', 'Muse_PPG_2', - - 'Myo_GYR_X', 'Myo_GYR_Y', 'Myo_GYR_Z', - 'Myo_EMG_0', 'Myo_EMG_1', 'Myo_EMG_2', 'Myo_EMG_3', 'Myo_EMG_4', 'Myo_EMG_5', 'Myo_EMG_6', 'Myo_EMG_7', - - 'PICARD_fnirs_0', 'PICARD_fnirs_1', - - 'Polar_bpm', 'Polar_hrv', - - 'ViveEye_pupilPos_L_X', 'ViveEye_pupilPos_L_Y', - 'ViveEye_pupilPos_R_X', 'ViveEye_pupilPos_R_Y', - - 'ViveEye_gazeOrigin_L_X', 'ViveEye_gazeOrigin_L_Y', 'ViveEye_gazeOrigin_L_Z', - 'ViveEye_gazeOrigin_R_X', 'ViveEye_gazeOrigin_R_Y', 'ViveEye_gazeOrigin_R_Z', - 'ViveEye_gazeDirection_L_X', 'ViveEye_gazeDirection_L_Y', 'ViveEye_gazeDirection_L_Z', - 'ViveEye_gazeDirection_R_X', 'ViveEye_gazeDirection_R_Y', 'ViveEye_gazeDirection_R_Z', - - 'ViveEye_eyeOpenness_L', 'ViveEye_pupilDiameter_L', - 'ViveEye_eyeOpenness_R', 'ViveEye_pupilDiameter_R', - - 'Zephyr_HR', 'Zephyr_HRV', - + "tlx_score", + "E4_BVP", + "E4_GSR", + "LooxidLink_EEG_A3", + "LooxidLink_EEG_A4", + "LooxidLink_EEG_FP1", + "LooxidLink_EEG_FP2", + "LooxidLink_EEG_A7", + "LooxidLink_EEG_A8", + "Muse_EEG_TP9", + "Muse_EEG_AF7", + "Muse_EEG_AF8", + "Muse_EEG_TP10", + "Muse_PPG_0", + "Muse_PPG_1", + "Muse_PPG_2", + "Myo_GYR_X", + "Myo_GYR_Y", + "Myo_GYR_Z", + "Myo_EMG_0", + "Myo_EMG_1", + "Myo_EMG_2", + "Myo_EMG_3", + "Myo_EMG_4", + "Myo_EMG_5", + "Myo_EMG_6", + "Myo_EMG_7", + "PICARD_fnirs_0", + "PICARD_fnirs_1", + "Polar_bpm", + "Polar_hrv", + "ViveEye_pupilPos_L_X", + "ViveEye_pupilPos_L_Y", + "ViveEye_pupilPos_R_X", + "ViveEye_pupilPos_R_Y", + "ViveEye_gazeOrigin_L_X", + "ViveEye_gazeOrigin_L_Y", + "ViveEye_gazeOrigin_L_Z", + "ViveEye_gazeOrigin_R_X", + "ViveEye_gazeOrigin_R_Y", + "ViveEye_gazeOrigin_R_Z", + "ViveEye_gazeDirection_L_X", + "ViveEye_gazeDirection_L_Y", + "ViveEye_gazeDirection_L_Z", + "ViveEye_gazeDirection_R_X", + "ViveEye_gazeDirection_R_Y", + "ViveEye_gazeDirection_R_Z", + "ViveEye_eyeOpenness_L", + "ViveEye_pupilDiameter_L", + "ViveEye_eyeOpenness_R", + "ViveEye_pupilDiameter_R", + "Zephyr_HR", + "Zephyr_HRV", # target - "induced_state" + "induced_state", ] + # uncomment to enable test # df = _test_missing_features(df) # test_suite - if 'test_suite' not in df.columns: - df['test_suite'] = "test" + if "test_suite" not in df.columns: + df["test_suite"] = "test" df["time"] = pd.to_datetime(df["time"], unit="us") df["timestamp"] = df["time"].dt.round("1s") @@ -118,30 +192,48 @@ def read_and_prepare_data_chunk(df): return ags -def get_dummy_template(df): +def get_dummy_template(df: pd.DataFrame) -> pd.DataFrame: + """ + The template is needed to match the expected sample submission format. 
+ """ df["time"] = pd.to_datetime(df["time"], unit="us") df["timestamp"] = df["time"].dt.round("1s") df = df.drop("time", axis=1) dummy_template = df.drop_duplicates( - subset=METADATA_COLUMNS + ["timestamp"], - keep="first" + subset=METADATA_COLUMNS + ["timestamp"], keep="first" ).reset_index(drop=True) dummy_template = dummy_template[METADATA_COLUMNS + ["timestamp"]] return dummy_template -def get_needed_data(df): +def get_needed_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]: + """ + Read data for training/testing and prepare template format for submission + + Return: + res1 - pd.DataFrame - read data + res2 - pd.DataFrame - template for submission + """ res1 = read_and_prepare_data_chunk(df) res2 = get_dummy_template(df) - return [res1, res2] + return res1, res2 def read_data( - path_to_data: str, - debug: bool = False -) -> pd.DataFrame: + path_to_data: str, debug: bool = False +) -> Tuple[pd.DataFrame, pd.DataFrame]: + """ + Read data. The data is read in chunks to reduce memory consumption. + + Args: + path_to_data (str): path to data + debug (bool, optional): run data loading on a sample of data. Defaults to False. + + Returns: + Tuple[pd.DataFrame, pd.DataFrame]: Read data and prepared template for submission + """ t_start = time.time() - chunksize = 10 ** 6 + chunksize = 10**6 if path_to_data is None: path_to_data = "./data/training-data.zip" @@ -152,11 +244,9 @@ def read_data( else: nrows = None + # create chunks iterator to read data chunks = pd.read_csv( - path_to_data, - na_values=NAN_VALUES, - chunksize=chunksize, - nrows=nrows + path_to_data, na_values=NAN_VALUES, chunksize=chunksize, nrows=nrows ) # get data for processing @@ -167,12 +257,14 @@ def read_data( res = [i[0] for i in full_result] res = pd.concat(res, axis=0) res = res.sort_index() - res = res[~res.index.duplicated(keep='first')] + res = res[~res.index.duplicated(keep="first")] # collect dummies for sub res2 = [i[1] for i in full_result] res2 = pd.concat(res2, axis=0) - res2 = res2.drop_duplicates(subset=METADATA_COLUMNS + ["timestamp"], keep="first").reset_index(drop=True) + res2 = res2.drop_duplicates( + subset=METADATA_COLUMNS + ["timestamp"], keep="first" + ).reset_index(drop=True) t_end = time.time() print(f"Data is read. 
Time per reading: {(t_end-t_start)/60:.2f} minutes") return res, res2 diff --git a/code/topcoder_cognitive_state/model.py b/code/topcoder_cognitive_state/model.py index cfe5bf6..fcd3527 100644 --- a/code/topcoder_cognitive_state/model.py +++ b/code/topcoder_cognitive_state/model.py @@ -1,25 +1,140 @@ from typing import List, Tuple -from functools import reduce - import pandas as pd import numpy as np -from scipy import stats +from lightgbm import LGBMClassifier + +from topcoder_cognitive_state.processing import FeaturesGenerator + + +ALLOWED_IMPORTANT_FEATURES = set( + [ + "E4_BVP", + "E4_GSR", + "LooxidLink_EEG_A3", + "LooxidLink_EEG_A4", + "LooxidLink_EEG_FP1", + "LooxidLink_EEG_FP2", + "LooxidLink_EEG_A7", + "LooxidLink_EEG_A8", + "Muse_EEG_TP9", + "Muse_EEG_AF7", + "Muse_EEG_AF8", + "Muse_EEG_TP10", + "Muse_PPG_0", + "Muse_PPG_1", + "Muse_PPG_2", + "Myo_GYR_X", + "Myo_GYR_Y", + "Myo_GYR_Z", + "Myo_EMG_0", + "Myo_EMG_1", + "Myo_EMG_2", + "Myo_EMG_3", + "Myo_EMG_4", + "Myo_EMG_5", + "Myo_EMG_6", + "Myo_EMG_7", + "PICARD_fnirs_0", + "PICARD_fnirs_1", + "Polar_bpm", + "Polar_hrv", + "ViveEye_eyeOpenness_L", + "ViveEye_pupilDiameter_L", + "ViveEye_pupilPos_L_X", + "ViveEye_pupilPos_L_Y", + "ViveEye_gazeOrigin_L_X", + "ViveEye_gazeOrigin_L_Y", + "ViveEye_gazeOrigin_L_Z", + "ViveEye_gazeDirection_L_X", + "ViveEye_gazeDirection_L_Y", + "ViveEye_gazeDirection_L_Z", + "ViveEye_eyeOpenness_R", + "ViveEye_pupilDiameter_R", + "ViveEye_pupilPos_R_X", + "ViveEye_pupilPos_R_Y", + "ViveEye_gazeOrigin_R_X", + "ViveEye_gazeOrigin_R_Y", + "ViveEye_gazeOrigin_R_Z", + "ViveEye_gazeDirection_R_X", + "ViveEye_gazeDirection_R_Y", + "ViveEye_gazeDirection_R_Z", + "Zephyr_HR", + "Zephyr_HRV", + ] +) + + +class Model: + def __init__( + self, + features: List[str], + preprocessor: FeaturesGenerator, + models_1: List[LGBMClassifier], + models_3: List[LGBMClassifier], + ): + """ + Model + + Args: + features (List[str]): list of features which were used during training + preprocessor (FeaturesGenerator): preprocessor + models_1 (List[LGBMClassifier]): list model models for t predictions + models_3 (List[LGBMClassifier]): list model models for t+3 predictions + """ + self.features = features + self.preprocessor = preprocessor + self.models_1 = models_1 + self.models_3 = models_3 + self.num_classes = 6 + + def ensure_features(self, x: pd.DataFrame) -> pd.DataFrame: + """ + Transform dataframe into the same format which was used during training. + I.e., ensure that all features are present. If a feature is missing - replace it with 0. + Ensure order of features in the dataframe. 
+ + Args: + x (pd.DataFrame): input data + + Returns: + pd.DataFrame: processed data + """ + for col in self.features: + if col not in x.columns: + x[col] = 0 + x = x[self.features] + return x + + def predict( + self, x_raw: pd.DataFrame + ) -> Tuple[np.array, np.array, np.array, np.array, List[List[str]]]: + """ + Make predictions for t and t+3, find the most important features + Args: + x_raw (pd.DataFrame): raw input data -ALLOWED_IMPORTANT_FEATURES = set(['E4_BVP', 'E4_GSR', 'LooxidLink_EEG_A3', 'LooxidLink_EEG_A4', 'LooxidLink_EEG_FP1', 'LooxidLink_EEG_FP2', - 'LooxidLink_EEG_A7', 'LooxidLink_EEG_A8', 'Muse_EEG_TP9', 'Muse_EEG_AF7', 'Muse_EEG_AF8', 'Muse_EEG_TP10', - 'Muse_PPG_0', 'Muse_PPG_1', 'Muse_PPG_2', 'Myo_GYR_X', 'Myo_GYR_Y', 'Myo_GYR_Z', 'Myo_EMG_0', 'Myo_EMG_1', - 'Myo_EMG_2', 'Myo_EMG_3', 'Myo_EMG_4', 'Myo_EMG_5', 'Myo_EMG_6', 'Myo_EMG_7', 'PICARD_fnirs_0', 'PICARD_fnirs_1', - 'Polar_bpm', 'Polar_hrv', 'ViveEye_eyeOpenness_L', 'ViveEye_pupilDiameter_L', 'ViveEye_pupilPos_L_X', - 'ViveEye_pupilPos_L_Y', 'ViveEye_gazeOrigin_L_X', 'ViveEye_gazeOrigin_L_Y', 'ViveEye_gazeOrigin_L_Z', - 'ViveEye_gazeDirection_L_X', 'ViveEye_gazeDirection_L_Y', 'ViveEye_gazeDirection_L_Z', 'ViveEye_eyeOpenness_R', - 'ViveEye_pupilDiameter_R', 'ViveEye_pupilPos_R_X', 'ViveEye_pupilPos_R_Y', 'ViveEye_gazeOrigin_R_X', - 'ViveEye_gazeOrigin_R_Y', 'ViveEye_gazeOrigin_R_Z', 'ViveEye_gazeDirection_R_X', 'ViveEye_gazeDirection_R_Y', - 'ViveEye_gazeDirection_R_Z', 'Zephyr_HR', 'Zephyr_HRV']) + Returns: + Tuple[np.array, np.array, np.array, np.array, List[List[str]]]: predictions and most important features + """ + # generate features + x, _, _, group = self.preprocessor.generate_features(x_raw, get_targers=False) + + # make predictions + y_hat_1, most_important_features = make_predictions(x, group, self.models_1) + y_hat_3 = y_hat_1 + + # transform predictions into labels + y_hat_1_label = self.preprocessor.apply_label2target(np.argmax(y_hat_1, axis=1)) + y_hat_3_label = y_hat_1_label + return y_hat_1, y_hat_3, y_hat_1_label, y_hat_3_label, most_important_features -def sum_arrays(arrs): +def sum_arrays(arrs: List[np.array]) -> np.array: + """ + Calculate sum of list of arrays + """ x = arrs[0] if len(arrs) == 1: return x @@ -28,21 +143,49 @@ def sum_arrays(arrs): return x -def mean_arrays(arrs): +def mean_arrays(arrs) -> np.array: + """ + Calculate the mean of the list of arrays + """ return sum_arrays(arrs) / len(arrs) -def postprocess_preds(preds, group): +def postprocess_preds(preds: np.array, group: pd.Series) -> pd.DataFrame: + """ + Smooth predictions by running rolling mean within the group + + Args: + preds (np.array): array of predicted probs + group (pd.Series): group id + + Returns: + pd.DataFrame: smoothed predicted probs + """ df = pd.DataFrame(preds) - df['group'] = group - res = df.groupby('group').rolling(window=999_999, min_periods=1).mean().values + df["group"] = group + res = df.groupby("group").rolling(window=999_999, min_periods=1).mean().values return res -def make_predictions(data: pd.DataFrame, group: pd.Series, models) -> Tuple[np.array, List[List[str]]]: +def make_predictions( + data: pd.DataFrame, group: pd.Series, models: List[LGBMClassifier] +) -> Tuple[np.array, List[List[str]]]: + """ + Make predictions and calculate the most important features + + Args: + data (pd.DataFrame): input raw data + group (pd.Series): group id for each row in input data + models (List[Model]): list of models. Final predictions is + avg prediction of all models predictions. 
+ + Returns: + Tuple[np.array, List[List[str]]]: predictions and most important features + """ y_hat = [] shap_values = [] for model in models: + # make predictions and calculate SHAP importance y_hat_pred = model.predict_proba(data) contribs = model.predict_proba(data, pred_contrib=True) @@ -50,24 +193,30 @@ def make_predictions(data: pd.DataFrame, group: pd.Series, models) -> Tuple[np.a # drop shap sum column: (n_samples, (n_features + 1) * n_classes) -> n_samples, n_features * n_classes y_shap_pred = np.vstack(contribs) n_features = data.shape[1] - indexes = [i for i in range(y_shap_pred.shape[1]) if (i // n_features > 0) and (i % n_features == 1)] + indexes = [ + i + for i in range(y_shap_pred.shape[1]) + if (i // n_features > 0) and (i % n_features == 1) + ] y_shap_pred = y_shap_pred[:, indexes] + # smooth predictions and shap importance using group id y_hat_pred = postprocess_preds(y_hat_pred, group) y_shap_pred = postprocess_preds(y_shap_pred, group) y_hat.append(y_hat_pred) shap_values.append(y_shap_pred) - + + # calculate mean for predictions and SHAP importance y_hat = mean_arrays(y_hat) shap_values = mean_arrays(shap_values) - # select most important featuers using shap values + # select most important featuers from list of allowed features most_important_features = [] topn_featuers = 3 for i in range(shap_values.shape[0]): row = shap_values[i, :] - ind_argsorted = np.argsort(row)[::-1] # high -> low + ind_argsorted = np.argsort(row)[::-1] # high -> low tmp = [] for j in ind_argsorted: feature = list(data.columns)[j % n_features] @@ -77,31 +226,3 @@ def make_predictions(data: pd.DataFrame, group: pd.Series, models) -> Tuple[np.a tmp = tmp[:topn_featuers] most_important_features.append(tmp) return y_hat, most_important_features - - -class Model: - def __init__(self, features: List[str], preprocessor, models_1, models_3): - self.features = features - self.preprocessor = preprocessor - self.models_1 = models_1 - self.models_3 = models_3 - self.num_classes = 6 - - def ensure_features(self, x): - for col in self.features: - if col not in x.columns: - x[col] = 0 - x = x[self.features] - return x - - def predict(self, x_raw): - x, _, _, group = self.preprocessor.generate_featres(x_raw, get_targers=False) - - # get probs - y_hat_1, most_important_features = make_predictions(x, group, self.models_1) - y_hat_3 = y_hat_1 - - # get labels - y_hat_1_label = self.preprocessor.apply_label2target(np.argmax(y_hat_1, axis=1)) - y_hat_3_label = y_hat_1_label - return y_hat_1, y_hat_3, y_hat_1_label, y_hat_3_label, most_important_features \ No newline at end of file diff --git a/code/topcoder_cognitive_state/opt_params.py b/code/topcoder_cognitive_state/opt_params.py index ecb1e4c..019fccb 100644 --- a/code/topcoder_cognitive_state/opt_params.py +++ b/code/topcoder_cognitive_state/opt_params.py @@ -1,33 +1,26 @@ -from typing import List, Tuple import sys import warnings -warnings.filterwarnings('ignore') -import copy +import copy import logging - -import pickle -from functools import reduce - import optuna -import pandas as pd -from sklearn.metrics import roc_auc_score -import numpy as np -from tqdm import tqdm -from lightgbm import LGBMClassifier from topcoder_cognitive_state.load_data import read_data from topcoder_cognitive_state.processing import FeaturesGenerator -from sklearn.model_selection import StratifiedGroupKFold -from topcoder_cognitive_state.model import Model +from topcoder_cognitive_state.model import Model # noqa: F401 from topcoder_cognitive_state.train import train_models 
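The opt_params.py changes in this hunk wrap train_models in the standard Optuna study/objective loop. As a minimal, self-contained sketch of that pattern only — the toy search space, the stand-in scoring function, and the trial count are illustrative assumptions, not the project's code:

import optuna


def stand_in_score(params: dict) -> float:
    # Stand-in for the real evaluation; opt_params.py calls train_models(...) here.
    return -abs(params["num_leaves"] - 63) - abs(params["learning_rate"] - 0.1)


def objective(trial: optuna.Trial) -> float:
    # Sample candidate LightGBM hyperparameters (toy search space).
    params = {
        "num_leaves": trial.suggest_int("num_leaves", 7, 255),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
    }
    return stand_in_score(params)


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)
print(study.best_trial.params)
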
+warnings.filterwarnings("ignore") + def main(): + """ + Optimize model's hyperparams using optuna + """ if len(sys.argv) < 2 or len(sys.argv[1]) == 0: print("Training input file is missing.") return 1 - + if len(sys.argv) < 3 or len(sys.argv[2]) == 0: print("Path to log is missing") return 1 @@ -39,14 +32,14 @@ def main(): handlers=[logging.FileHandler(path_to_log, mode="w"), logging.StreamHandler()], ) - print('Training started.') - + print("Training started.") + input_file = sys.argv[1] - output_file = sys.argv[2] + _ = sys.argv[2] data, _ = read_data(input_file) processor = FeaturesGenerator() - X, Y1, Y3, META = processor.generate_featres_train(data) + X, Y1, Y3, META = processor.generate_features_train(data) default_params = { "num_leaves": 127, @@ -58,25 +51,28 @@ def main(): "colsample_bytree": 0.67, "reg_alpha": 1.0, "reg_lambda": 1.0, - 'random_state': 42 + "random_state": 42, } def objective(trial): params = copy.deepcopy(default_params) - params.update({ - "num_leaves": trial.suggest_int('num_leaves', 7, 255, 8), - "max_depth": trial.suggest_int('max_depth', 3, 14, 1), - - "min_child_weight": trial.suggest_loguniform('min_child_weight', 1e-18, 1), - "min_child_samples": trial.suggest_int('min_child_samples', 1, 100, 1, log=True), - "min_split_gain": trial.suggest_loguniform('min_split_gain', 1e-18, 1), - - "subsample": trial.suggest_float('subsample', 0.1, 1.0), - "colsample_bytree": trial.suggest_float('colsample_bytree', 0.1, 1.0), - - "reg_alpha": trial.suggest_float('reg_alpha', 0.0, 10), - "reg_lambda": trial.suggest_float('reg_lambda', 0.0, 10), - }) + params.update( + { + "num_leaves": trial.suggest_int("num_leaves", 7, 255, 8), + "max_depth": trial.suggest_int("max_depth", 3, 14, 1), + "min_child_weight": trial.suggest_loguniform( + "min_child_weight", 1e-18, 1 + ), + "min_child_samples": trial.suggest_int( + "min_child_samples", 1, 100, 1, log=True + ), + "min_split_gain": trial.suggest_loguniform("min_split_gain", 1e-18, 1), + "subsample": trial.suggest_float("subsample", 0.1, 1.0), + "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0), + "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 10), + "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 10), + } + ) _, _, test_score = train_models(X, Y1, Y3, META, params_to_train=[params]) logging.info(f"Next itter score - {test_score}") @@ -96,5 +92,6 @@ def objective(trial): for key, value in best_trial.params.items(): logging.info(" {}: {}".format(key, value)) + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/code/topcoder_cognitive_state/processing.py b/code/topcoder_cognitive_state/processing.py index 3c7f804..b6b7d35 100644 --- a/code/topcoder_cognitive_state/processing.py +++ b/code/topcoder_cognitive_state/processing.py @@ -1,95 +1,152 @@ +from typing import List, Tuple import re -import pandas as pd +import pandas as pd import numpy as np from tqdm import tqdm from topcoder_cognitive_state.CONSTANTS import TARGET2LABEL, LABEL2TARGET -def get_distance(x1, x2): +def get_distance(x1: List[pd.Series], x2: List[pd.Series]) -> pd.Series: + """ + Calculate l2 distance between points + + Args: + x1 (List[pd.Series]): list of point x1 coordinates, e.g. X1, Y1, Z1 + x2 (List[pd.Series]): list of point x2 coordinates, e.g. 
X2, Y2, Z2 + + Returns: + pd.Series: l2 distance + """ delta = 0 - for a1, a2 in zip(x1,x2): + for a1, a2 in zip(x1, x2): delta += (a1 - a2) ** 2 delta = delta ** (1 / len(x1)) return delta class FeaturesGenerator: - def __init__(self, target_column="induced_state"): + def __init__(self, target_column: str = "induced_state"): + """ + Generate features using raw data + + Args: + target_column (str, optional): target column. Defaults to "induced_state". + """ self.target_column = target_column - + self.target2label = TARGET2LABEL self.label2target = LABEL2TARGET - def rename_cols(self, x): - x = x.rename(columns = lambda x: re.sub('[^A-Za-z0-9_]+', '', x)) + def rename_cols(self, x: pd.DataFrame): + """ + Lightgbm doesn't work well with all features names, so we need to rename some of them.""" + x = x.rename(columns=lambda x: re.sub("[^A-Za-z0-9_]+", "", x)) return x - def get_targets(self, x, future: int = 1): + def get_targets(self, x: pd.DataFrame, future: int = 1) -> pd.Series: + """ + Create training targets + + Args: + x (pd.DataFrame): input data + future (int, optional): steps to look into future. + Future=1 equals the current moment (t). + Future=2 equals to the next moment (t+1). + Defaults to 1. + + Returns: + pd.Series: target + """ if future == 0: y = x[self.target_column] else: - y = x.groupby("session_id")[self.target_column].shift(-1 * future).fillna(method='ffill') + y = ( + x.groupby("session_id")[self.target_column] + .shift(-1 * future) + .fillna(method="ffill") # replace missing targets with previous value + ) ind = y.isnull() y[ind] = x.loc[ind, self.target_column] y_label = self.apply_target2label(y) return y_label - - def apply_target2label(self, y): + + def apply_target2label(self, y: pd.Series) -> pd.Series: + # 'low' -> 0, 'medium' -> 1, etc return pd.Series(y).map(self.target2label) - - def apply_label2target(self, y): + + def apply_label2target(self, y: pd.Series) -> pd.Series: + # -> 'low', 1 -> 'medium', etc return pd.Series(y).map(self.label2target) - def calc_eyes_distances(self, x): + def calc_eyes_distances(self, x: pd.DataFrame) -> pd.DataFrame: + """ + Calculate features based on raw features from the eyes tracker. 
+ + Args: + x (pd.DataFrame): input data + + Returns: + pd.DataFrame: new dataframe with new features + """ new_features = pd.DataFrame({}) shifts = [1, 3] eyes = ["L", "R"] # L / R eyes new_features["ViveEye_pupilPos_LR_distance"] = get_distance( - (x["ViveEye_pupilPos_L_X"], x["ViveEye_pupilPos_L_Y"]), - (x["ViveEye_pupilPos_R_X"], x["ViveEye_pupilPos_R_Y"]) - ) + (x["ViveEye_pupilPos_L_X"], x["ViveEye_pupilPos_L_Y"]), + (x["ViveEye_pupilPos_R_X"], x["ViveEye_pupilPos_R_Y"]), + ) # distances for s in shifts: for pos in eyes: new_features[f"ViveEye_pupilPos_distance_{s}_{pos}"] = get_distance( + (x[f"ViveEye_pupilPos_{pos}_X"], x[f"ViveEye_pupilPos_{pos}_Y"]), ( - x[f"ViveEye_pupilPos_{pos}_X"], - x[f"ViveEye_pupilPos_{pos}_Y"] - ), - ( - x.groupby("session_id")[f"ViveEye_pupilPos_{pos}_X"].shift(s).values, - x.groupby("session_id")[f"ViveEye_pupilPos_{pos}_Y"].shift(s).values - ) + x.groupby("session_id")[f"ViveEye_pupilPos_{pos}_X"] + .shift(s) + .values, + x.groupby("session_id")[f"ViveEye_pupilPos_{pos}_Y"] + .shift(s) + .values, + ), ) - + # L / R eyes for feature in ["ViveEye_gazeOrigin", "ViveEye_gazeDirection"]: new_features["{feature}_LR_distance"] = get_distance( - (x[f"{feature}_L_X"], x[f"{feature}_L_Y"], x[f"{feature}_L_Z"]), - (x[f"{feature}_R_X"], x[f"{feature}_R_Y"], x[f"{feature}_R_Z"]) + (x[f"{feature}_L_X"], x[f"{feature}_L_Y"], x[f"{feature}_L_Z"]), + (x[f"{feature}_R_X"], x[f"{feature}_R_Y"], x[f"{feature}_R_Z"]), ) for s in shifts: for pos in eyes: new_features[f"distance_{feature}_{s}_{pos}"] = get_distance( ( - x[f"{feature}_{pos}_X"], - x[f"{feature}_{pos}_Y"], - x[f"{feature}_{pos}_Z"] - ), + x[f"{feature}_{pos}_X"], + x[f"{feature}_{pos}_Y"], + x[f"{feature}_{pos}_Z"], + ), ( - x.groupby("session_id")[f"{feature}_{pos}_X"].shift(s).values, - x.groupby("session_id")[f"{feature}_{pos}_Y"].shift(s).values, - x.groupby("session_id")[f"{feature}_{pos}_Z"].shift(s).values - ), + x.groupby("session_id")[f"{feature}_{pos}_X"] + .shift(s) + .values, + x.groupby("session_id")[f"{feature}_{pos}_Y"] + .shift(s) + .values, + x.groupby("session_id")[f"{feature}_{pos}_Z"] + .shift(s) + .values, + ), ) return new_features - def get_session_id_and_time_since_break(self, data): + def get_session_id_and_time_since_break(self, data: pd.DataFrame) -> pd.DataFrame: + """ + Calculate session id and time since the last break. + """ tmp = data.copy().reset_index() diff = tmp["timestamp"].diff().dt.total_seconds() tmp["session_id"] = np.cumsum(diff > 1) @@ -97,11 +154,23 @@ def get_session_id_and_time_since_break(self, data): data["session_id"] = tmp["session_id"].values return data - def generate_featres(self, x, get_targers: bool = False): + def generate_features( + self, x: pd.DataFrame, get_targers: bool = False + ) -> Tuple[pd.DataFrame, pd.Series, pd.Series, pd.Series]: + """ + Process raw data and create new features for training + + Args: + x (pd.DataFrame): raw input data + get_targers (bool, optional): generate targets? Needed for training. Defaults to False. 
+ + Returns: + Tuple[pd.DataFrame, pd.Series, pd.Series, pd.Series]: output + """ x = self.get_session_id_and_time_since_break(x) x = x.reset_index(drop=True) - # 1 and 3 seconds + # create targets for t and t+3 models if get_targers: y1 = self.get_targets(x, future=0) y3 = self.get_targets(x, future=3) @@ -113,42 +182,63 @@ def generate_featres(self, x, get_targers: bool = False): # additional featurers for c1, c2 in [ - ('Zephyr_HR', 'Zephyr_HRV'), - ('Polar_bpm', 'Polar_hrv'), - ('Zephyr_HRV', 'Polar_hrv'), + ("Zephyr_HR", "Zephyr_HRV"), + ("Polar_bpm", "Polar_hrv"), + ("Zephyr_HRV", "Polar_hrv"), ]: x[f"{c1}_div_{c2}"] = x[c1] / x[c2] dfs = [x] - # rolling stats + # create rolling stats features windows = [5, 999_999] cols = list(x.columns) for w in windows: - rolling_mean = x.groupby("session_id")[cols].rolling(min_periods=1, window=w).mean().reset_index(drop=True) - rolling_std = x.groupby("session_id")[cols].rolling(min_periods=1, window=w).std().reset_index(drop=True) + rolling_mean = ( + x.groupby("session_id")[cols] + .rolling(min_periods=1, window=w) + .mean() + .reset_index(drop=True) + ) + rolling_std = ( + x.groupby("session_id")[cols] + .rolling(min_periods=1, window=w) + .std() + .reset_index(drop=True) + ) normed = (x - rolling_mean) / (rolling_std + 1) - + normed = normed.add_prefix(f"normed_by_session_{w}_") rolling_mean = rolling_mean.add_prefix(f"mean_by_session_{w}_") rolling_std = rolling_std.add_prefix(f"std_by_session_{w}_") - + dfs += [rolling_mean, rolling_std, normed] + # create global stats features windows = [999_999] - global_cols = cols # + global_cols = cols for w in windows: - rolling_mean = x[global_cols].rolling(min_periods=2, window=w).mean().reset_index(drop=True) - rolling_std = x[global_cols].rolling(min_periods=2, window=w).std().reset_index(drop=True) + rolling_mean = ( + x[global_cols] + .rolling(min_periods=2, window=w) + .mean() + .reset_index(drop=True) + ) + rolling_std = ( + x[global_cols] + .rolling(min_periods=2, window=w) + .std() + .reset_index(drop=True) + ) normed = (x - rolling_mean) / (rolling_std + 1) - + normed = normed.add_prefix(f"normed_global_{w}_") rolling_mean = rolling_mean.add_prefix(f"mean_global_{w}_") rolling_std = rolling_std.add_prefix(f"std_global_{w}_") - + dfs += [rolling_mean, rolling_std, normed] - # shift features + # shift features for s in [1, 3]: gr_s = x.groupby("session_id")[cols].shift(s).reset_index(drop=True) tmp = x - gr_s @@ -164,7 +254,15 @@ def generate_featres(self, x, get_targers: bool = False): df = self.rename_cols(df) return df, y1, y3, session_id - def generate_featres_train(self, data): + def generate_features_train( + self, data: pd.DataFrame + ) -> Tuple[pd.DataFrame, pd.Series, pd.Series, pd.Series]: + """ + Generate features and targets for training + + Args: + data (pd.DataFrame): raw input data + """ # last index is time indexes = [i[:-1] for i in data.index] indexes = list(set(indexes)) @@ -174,14 +272,14 @@ def generate_featres_train(self, data): for index in tqdm(indexes): index_data_to_select = index # use full index x = data.loc[index_data_to_select] - - r, y1, y3, session_id = self.generate_featres(x, get_targers=True) - + + r, y1, y3, session_id = self.generate_features(x, get_targers=True) + X.append(r) Y1.append(y1) Y3.append(y3) - - # group id = person + task + + # group id = person + task task = pd.Series([index] * r.shape[0]) _meta = task.astype("str") + "__" + session_id.astype("str") META.append(_meta) @@ -190,4 +288,4 @@ def generate_featres_train(self, data): Y1 = 
pd.concat(Y1, axis=0).reset_index(drop=True) Y3 = pd.concat(Y3, axis=0).reset_index(drop=True) META = pd.concat(META, axis=0).reset_index(drop=True) - return X, Y1, Y3, META \ No newline at end of file + return X, Y1, Y3, META diff --git a/code/topcoder_cognitive_state/test.py b/code/topcoder_cognitive_state/test.py index cadbf18..025bc1d 100755 --- a/code/topcoder_cognitive_state/test.py +++ b/code/topcoder_cognitive_state/test.py @@ -1,20 +1,23 @@ +from typing import List import sys import warnings -warnings.filterwarnings('ignore') import time - import pickle -import pandas as pd import numpy as np +import pandas as pd from tqdm import tqdm from topcoder_cognitive_state.load_data import read_data -from topcoder_cognitive_state.CONSTANTS import METADATA_COLUMNS from topcoder_cognitive_state.model import Model +warnings.filterwarnings("ignore") -def arrays_to_str_list(arr): + +def arrays_to_str_list(arr: np.array) -> List[str]: + """ + Transform predictions arrays into a list of strings to match the submission format. + """ result = [] for i in range(arr.shape[0]): tmp = list(arr[i, :]) @@ -23,7 +26,10 @@ def arrays_to_str_list(arr): return result -def lists_to_str_list(arr): +def lists_to_str_list(arr: List[List[str]]) -> List[str]: + """ + Transform most important features arrays into a list of strings to match submission format. + """ result = [] for tmp in arr: tmp = "[" + " ".join(["'" + str(s) + "'" for s in tmp]) + "]" @@ -31,14 +37,33 @@ def lists_to_str_list(arr): return result -def make_predictions_for_test_suite(data, test_suite, model): - y_hat_1, y_hat_3, y_hat_1_label, y_hat_3_label, most_important_features = model.predict(data) +def make_predictions_for_test_suite( + data: pd.DataFrame, test_suite: pd.Series, model: Model +) -> pd.DataFrame: + """ + Make predictions for single `test_suite` + + Args: + data (pd.DataFrame): test_suite input data + test_suite (pd.Series): test_suite value + model (Model): model + + Returns: + pd.DataFrame: predictions + """ + ( + y_hat_1, + y_hat_3, + y_hat_1_label, + y_hat_3_label, + most_important_features, + ) = model.predict(data) # combine results result = pd.DataFrame({}) - result['timestamp'] = data.reset_index()['timestamp'] - result['test_suite'] = test_suite + result["timestamp"] = data.reset_index()["timestamp"] + result["test_suite"] = test_suite result["predicted_induced_state"] = y_hat_1_label result["three_sec_predicted_induced_state"] = y_hat_3_label @@ -49,19 +74,32 @@ def make_predictions_for_test_suite(data, test_suite, model): result["top_three_features"] = lists_to_str_list(most_important_features) result_cols = [ - 'timestamp', - 'test_suite', - 'predicted_induced_state', - 'predicted_induced_state_confidence', - 'three_sec_predicted_induced_state', - 'three_sec_predicted_induced_state_confidence', - 'top_three_features' + "timestamp", + "test_suite", + "predicted_induced_state", + "predicted_induced_state_confidence", + "three_sec_predicted_induced_state", + "three_sec_predicted_induced_state_confidence", + "top_three_features", ] result = result[result_cols] return result -def make_predictions(data, dummies, model): +def make_predictions( + data: pd.DataFrame, dummies: pd.DataFrame, model: Model +) -> pd.DataFrame: + """ + Make predictions for raw input data + + Args: + data (pd.DataFrame): input data + dummies (pd.DataFrame): dummies dataframe to match sample submission format + model (Model): model to make predictions + + Returns: + pd.DataFrame: predictions + """ t_start = time.time() # get unique @@ -82,8 +120,10 @@ 
def make_predictions(data, dummies, model): result = pd.merge(dummies, result, how="left", on=["timestamp", "test_suite"]) # process ts - result['timestamp'] = pd.to_datetime(result['timestamp']).apply(lambda x: x.value) / 10**3 - result['timestamp'] = result['timestamp'].astype("int") + result["timestamp"] = ( + pd.to_datetime(result["timestamp"]).apply(lambda x: x.value) / 10**3 + ) + result["timestamp"] = result["timestamp"].astype("int") t_end = time.time() print(f"Predicions are made. Time: {(t_end-t_start)/60:.2f} minutes") @@ -94,18 +134,18 @@ def main(): if len(sys.argv) < 2 or len(sys.argv[1]) == 0: print("Testing input file is missing.") return 1 - + if len(sys.argv) < 3 or len(sys.argv[2]) == 0: print("Testing output file is missing.") return 1 - - print('Testing started.') + + print("Testing started.") input_file = sys.argv[1] output_file = sys.argv[2] model_file = sys.argv[3] - with open(model_file,'rb') as f: + with open(model_file, "rb") as f: model = pickle.load(f) # load data @@ -114,5 +154,6 @@ def main(): result.to_csv(output_file, index=False) return 0 + if __name__ == "__main__": main() diff --git a/code/topcoder_cognitive_state/train.py b/code/topcoder_cognitive_state/train.py index ee2e4bf..62315c5 100755 --- a/code/topcoder_cognitive_state/train.py +++ b/code/topcoder_cognitive_state/train.py @@ -1,24 +1,23 @@ -from pyexpat import features -from typing import List, Tuple +from typing import List, Optional, Dict, Tuple import sys import warnings -warnings.filterwarnings('ignore') - import pickle -from functools import reduce -import pandas as pd from sklearn.metrics import roc_auc_score import numpy as np from tqdm import tqdm from lightgbm import LGBMClassifier +import pandas as pd from topcoder_cognitive_state.load_data import read_data from topcoder_cognitive_state.processing import FeaturesGenerator from sklearn.model_selection import StratifiedGroupKFold from topcoder_cognitive_state.model import Model +warnings.filterwarnings("ignore") + +# lightgbm model hyperparams PARAMS = [ { "num_leaves": 151, @@ -32,43 +31,50 @@ "reg_lambda": 9.90, "min_child_weight": 0.005519, "min_split_gain": 1.94e-14, - 'random_state': 42 + "random_state": 42, }, - { - 'num_leaves': 79, - 'learning_rate': 0.12, - 'n_estimators': 600, - 'min_child_samples': 14, - 'subsample': 0.75, - 'subsample_freq': 5, - 'colsample_bytree': 0.75, - 'reg_alpha': 2.2, - 'reg_lambda': 1.5, - 'random_state': 424242, - 'min_child_weight': 6.681437316563333e-12, - 'min_split_gain': 0.00039529173804292325, + "num_leaves": 79, + "learning_rate": 0.12, + "n_estimators": 600, + "min_child_samples": 14, + "subsample": 0.75, + "subsample_freq": 5, + "colsample_bytree": 0.75, + "reg_alpha": 2.2, + "reg_lambda": 1.5, + "random_state": 424242, + "min_child_weight": 6.681437316563333e-12, + "min_split_gain": 0.00039529173804292325, }, - { - 'num_leaves': 23, - 'learning_rate': 0.12, - 'n_estimators': 500, - 'min_child_samples': 30, - 'subsample': 0.6, - 'subsample_freq': 5, - 'colsample_bytree': 0.4, - 'reg_alpha': 0, - 'reg_lambda': 0, - 'random_state': 4242, - 'min_child_weight': 0.28, - 'min_split_gain': 9.793058539831146e-08, - } - + "num_leaves": 23, + "learning_rate": 0.12, + "n_estimators": 500, + "min_child_samples": 30, + "subsample": 0.6, + "subsample_freq": 5, + "colsample_bytree": 0.4, + "reg_alpha": 0, + "reg_lambda": 0, + "random_state": 4242, + "min_child_weight": 0.28, + "min_split_gain": 9.793058539831146e-08, + }, ] -def get_auc(y_true, probas_pred): +def get_auc(y_true: np.array, probas_pred: 
np.array) -> Tuple[str, float, bool]: + """ + Calculate avg auc for multiclass classification + + Args: + y_true (np.array): array of labels + probas_pred (np.array): array of predicted probs + + Returns: + Tuple[str, float, bool]: name of metrics, the value of metric, higher = better? + """ aucs = [] preds = np.array(probas_pred) preds = preds.reshape(-1, 6) @@ -79,14 +85,39 @@ def get_auc(y_true, probas_pred): return "mean_auc", score, True -def drop_null_targets(X_raw, y_raw): +def drop_null_targets( + X_raw: pd.DataFrame, y_raw: pd.Series +) -> Tuple[pd.DataFrame, pd.Series]: + """ + Drop all rows where target is missing + + Args: + X_raw (pd.DataFrame): features + y_raw (pd.Series): targets + + Returns: + Tuple[pd.DataFrame, pd.Series]: features and targets without rows with missing data + """ X = X_raw.reset_index(drop=True) y = y_raw.reset_index(drop=True) ind = y.notnull() return X.loc[ind], y[ind] -def train_model(X, Y1, Y3): +def train_model( + X: pd.DataFrame, Y1: pd.Series, Y3: pd.Series +) -> Tuple[List[LGBMClassifier], List[LGBMClassifier]]: + """ + Train models for t and t+3 predictions + + Args: + X (pd.DataFrame): input features + Y1 (pd.Series): t targets + Y3 (pd.Series): t+3 targets + + Returns: + List[LGBMClassifier]: list of trained models + """ models_1 = [] models_3 = [] for params in PARAMS: @@ -101,19 +132,47 @@ def train_model(X, Y1, Y3): return models_1, models_3 -def train_models(X, Y1, Y3, META, params_to_train=None): +def train_models( + X: pd.Dataframe, + Y1: pd.Series, + Y3: pd.Series, + META: pd.Series, + params_to_train: Optional[List[Dict]] = None, +) -> Tuple[List[LGBMClassifier], List[LGBMClassifier], float]: + """ + Train models + + Args: + X (pd.DataFrame): input features + Y1 (pd.Series): t targets + Y3 (pd.Series): t+3 targets + META (pd.Series): metadata + params_to_train (Optional[List[Dict]], optional): list of dicts of hyperparams to use for training. Defaults to None. 
+ + Returns: + Tuple[List[LGBMClassifier], List[LGBMClassifier], float]: t and t+3 trained models, mean validation score + """ if params_to_train is None: params_to_train = PARAMS - folds = list(StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42).split(X, Y1, META)) - folds += list(StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=100).split(X, Y1, META)) + # use repeated statified group kfold for training + folds = list( + StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42).split( + X, Y1, META + ) + ) + folds += list( + StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=100).split( + X, Y1, META + ) + ) models_1, models_3 = [], [] for params in params_to_train: print("Training model with new params") print(params) - scores_1, scores_3 = [], [] + scores_1 = [] fin_scores = [] for fold_ind, (train_index, test_index) in tqdm(enumerate(folds)): # get split @@ -121,30 +180,31 @@ def train_models(X, Y1, Y3, META, params_to_train=None): y_1_train, y_1_test = Y1[train_index], Y1[test_index] y_3_train, y_3_test = Y3[train_index], Y3[test_index] # meta_train, meta_val = META[train_index], META[test_index] - + # drop null targets X_train_1, y_train_1 = drop_null_targets(X_train, y_1_train) - X_train_3, y_train_3 = drop_null_targets(X_train, y_3_train) + _, _ = drop_null_targets(X_train, y_3_train) X_val_1, y_val_1 = drop_null_targets(X_test, y_1_test) - X_val_3, y_val_3 = drop_null_targets(X_test, y_3_test) - + _, _ = drop_null_targets(X_test, y_3_test) + try: # train models model_1 = LGBMClassifier(**params) model_1.fit( - X_train_1, y_train_1, - eval_set=(X_val_1, y_val_1), - # eval_metric=get_auc, + X_train_1, + y_train_1, + eval_set=(X_val_1, y_val_1), verbose=100, - early_stopping_rounds=50 + early_stopping_rounds=50, ) - auc_1 = get_auc(y_true=y_val_1, probas_pred=model_1.predict_proba(X_val_1))[1] + auc_1 = get_auc( + y_true=y_val_1, probas_pred=model_1.predict_proba(X_val_1) + )[1] scores_1.append(auc_1) models_1.append(model_1) models_3 = models_1 auc_3 = auc_1 - scores_3 = scores_1 fin_score = 0.7 * auc_1 + 0.3 * auc_3 fin_scores.append(fin_score) @@ -159,35 +219,39 @@ def train_models(X, Y1, Y3, META, params_to_train=None): def main(): + """ + Run training + """ if len(sys.argv) < 2 or len(sys.argv[1]) == 0: print("Training input file is missing.") return 1 - + if len(sys.argv) < 3 or len(sys.argv[2]) == 0: print("Training output file is missing.") return 1 - print('Training started.') - + print("Training started.") + input_file = sys.argv[1] output_file = sys.argv[2] data, _ = read_data(input_file) processor = FeaturesGenerator() - X, Y1, Y3, META = processor.generate_featres_train(data) + X, Y1, Y3, _ = processor.generate_features_train(data) models_1, models_3 = train_model(X, Y1, Y3) main_model = Model( features=list(X.columns), - preprocessor=processor, - models_1=models_1, - models_3=models_3 + preprocessor=processor, + models_1=models_1, + models_3=models_3, ) - with open(output_file, 'wb') as f: + with open(output_file, "wb") as f: pickle.dump(main_model, f, pickle.HIGHEST_PROTOCOL) - print('Training finished.') + print("Training finished.") return 0 + if __name__ == "__main__": main() diff --git a/code/topcoder_cognitive_state/utils.py b/code/topcoder_cognitive_state/utils.py deleted file mode 100644 index e69de29..0000000
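
As a closing reference, a rough sketch of how the pickled artifact written by train.py is consumed, distilled from test.py above. The file paths are placeholders, and test.py additionally slices the prepared frame per test_suite before predicting:

import pickle

from topcoder_cognitive_state.load_data import read_data

# Load the Model object that train.py pickled (path is a placeholder).
with open("model.pkl", "rb") as f:
    model = pickle.load(f)

# read_data returns the prepared sensor frame and the submission template.
data, dummies = read_data("./data/testing-data.zip")

# Model.predict generates features internally and returns probabilities and
# labels for t and t+3, plus the top SHAP-ranked features for each row.
y_hat_1, y_hat_3, labels_1, labels_3, top_features = model.predict(data)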