From 4d105e14aec10b0cc3d53cc037973490c63591b6 Mon Sep 17 00:00:00 2001
From: rchan <rchan@turing.ac.uk>
Date: Mon, 14 Aug 2023 14:57:59 +0100
Subject: [PATCH 1/5] add option to have additional features

---
 src/nlpsig/classification_utils.py |   1 -
 src/nlpsig/data_preparation.py     | 273 ++++++++++++++++-------------
 tests/test_data_preparation.py     |  96 +++++-----
 tests/test_padding.py              | 122 ++++++-------
 4 files changed, 256 insertions(+), 236 deletions(-)

diff --git a/src/nlpsig/classification_utils.py b/src/nlpsig/classification_utils.py
index dc24203..93bc5df 100644
--- a/src/nlpsig/classification_utils.py
+++ b/src/nlpsig/classification_utils.py
@@ -110,7 +110,6 @@ def __init__(
         else:
             if self.groups is not None:
                 # see https://github.com/scikit-learn/scikit-learn/issues/9193
-                print("[INFO] Splitting data by provided groups")
                 self.shuffle = False
 
                 if x_data.shape[0] != len(self.groups):
diff --git a/src/nlpsig/data_preparation.py b/src/nlpsig/data_preparation.py
index 77bda80..129c48c 100644
--- a/src/nlpsig/data_preparation.py
+++ b/src/nlpsig/data_preparation.py
@@ -97,7 +97,7 @@ def __init__(
 
         self.pooled_embeddings: np.array | None = pooled_embeddings
         # obtain time features
-        self._time_feature_choices: list[str] = []
+        self._feature_list: list[str] = []
         self.time_features_added: bool = False
         self.df = self._set_time_features()
         self.df_padded: pd.DataFrame | None = None
@@ -207,13 +207,13 @@ def _set_time_features(self) -> pd.DataFrame:
             return None
         print("[INFO] Adding time feature columns into dataframe in `.df`.")
         if "datetime" in self.df.columns:
-            self._time_feature_choices += ["time_encoding", "time_diff"]
+            self._feature_list += ["time_encoding", "time_diff"]
 
             # checking 'datetime' column is datatime type
             self.df["datetime"] = pd.to_datetime(self.df["datetime"])
 
             # obtain time encoding by computing the fraction of year it is in
-            print("[INFO] Adding 'time_encoding' and feature...")
+            print("[INFO] Adding 'time_encoding' feature...")
             self.df["time_encoding"] = self.df["datetime"].map(
                 lambda t: self._time_fraction(t)
             )
@@ -224,7 +224,7 @@ def _set_time_features(self) -> pd.DataFrame:
             self.df = self.df.sort_values(by=[self.id_column, "datetime"])
 
             # calculate time difference between posts
-            print("[INFO] Adding 'time_diff' and feature...")
+            print("[INFO] Adding 'time_diff' feature...")
             self.df["time_diff"] = list(
                 self.df.groupby(self.id_column)
                 .apply(
@@ -249,7 +249,7 @@ def _set_time_features(self) -> pd.DataFrame:
                 "we assume that the data is ordered by time with respect to the id."
             )
         # assign index for each post in each timeline
-        self._time_feature_choices += ["timeline_index"]
+        self._feature_list += ["timeline_index"]
 
         print("[INFO] Adding 'timeline_index' feature...")
         self.df["timeline_index"] = list(
@@ -266,7 +266,7 @@ def _set_time_features(self) -> pd.DataFrame:
 
         return self.df
 
-    def _obtain_colnames(self, embeddings: str) -> list[str]:
+    def _obtain_embedding_colnames(self, embeddings: str) -> list[str]:
         """
         [Private] Obtains the column names storing the embeddings.
 
@@ -308,62 +308,85 @@ def _obtain_colnames(self, embeddings: str) -> list[str]:
 
         return colnames
 
-    def _obtain_time_feature_columns(
+    def _check_feature_exists(self, feature: str) -> bool:
+        """
+        [Private] Checks if `feature` is in `self._feature_list`. If not, checks
+        whether it is a column in `self.df` and, if so, adds it to the feature list.
+
+        Parameters
+        ----------
+        feature : str
+            Feature name.
+
+        Returns
+        -------
+        bool
+            True if `feature` is in `self._feature_list` or is a
+            column name in `self.df`.
+        """
+        if (feature not in self._feature_list) and (feature in self.df.columns):
+            # not in ._feature_list, but is a valid column name in self.df,
+            # so add to feature list
+            self._feature_list += [feature]
+            
+        return feature in self._feature_list
+    
+    def _obtain_feature_columns(
         self,
-        time_feature: list[str] | str | None,
+        features: list[str] | str | None,
     ) -> list[str]:
         """
-        [Private] Obtains the column names storing the time features requested.
+        [Private] Obtains the column names storing the feature(s) requested.
         If a string or list is passed, it essentially just checks if it is an
-        available time feature that is stored in `_time_feature_choices` and returns
-        the time features in a list.
+        available feature that is stored in `_feature_list` and returns
+        the feature(s) in a list.
 
         Parameters
         ----------
-        time_feature : list[str] | str | None
+        features : list[str] | str | None
             If is a string, it must be in the list found in
-            `_time_feature_choices` attribute. If is a list,
+            `_feature_list` attribute or be a column in `self.df`. If it is a list,
             each item must be a string and it must be in the
-            list found in `_time_feature_choices` attribute.
+            list found in `_feature_list` attribute or be a column in `self.df`.
 
         Returns
         -------
         list[str]
-            List of column names which store the time features.
+            List of column names which store the feature(s).
 
         Raises
         ------
         ValueError
-            if `time_feature` is a string, and it is not found in `_time_feature_choices`.
-        ValueError
-            if `time_feature` is a list of strings, and one of the items
-            is not found in `_time_feature_choices`.
-        TypeError
-            if `time_feature` is neither a string or a list.
+            if `features`, or any item of it when it is a list, is neither
+            in `_feature_list` nor a column in `self.df`.
+        TypeError
+            if `features` is neither None, a string, nor a list.
         """
-        if time_feature is None:
-            time_feature = []
+        if features is None:
+            # no features are wanted, return an empty list
+            features = []
         else:
-            if not self.time_features_added:
-                self.set_time_features()
-            if isinstance(time_feature, str):
-                if time_feature not in self._time_feature_choices:
-                    raise ValueError(
-                        "If `time_feature` is a string, it must "
-                        f"be in {self._time_feature_choices}."
-                    )
-                time_feature = [time_feature]
-            elif isinstance(time_feature, list):
-                if not all(item in self._time_feature_choices for item in time_feature):
-                    raise ValueError(
-                        f"Each item in `time_feature` should be in {self._time_feature_choices}."
-                    )
+            # convert to list of strings
+            if isinstance(features, str):
+                features = [features]
+            
+            if isinstance(features, list):    
+                # check each item in features is in self._feature_list
+                # if it isn't, but is a column in self.df, it will add
+                # it to self._feature_list
+                for item in features:
+                    if not self._check_feature_exists(feature=item):
+                        raise ValueError(
+                            f"{item} must be in `self.feature_list`: {self._feature_list},"
+                            "or a column in `self.df`."
+                        )
             else:
+                # features is neither None, a string or a list
                 raise TypeError(
                     "`time_feature` must be either None, a string, or a list of strings."
                 )
 
-        return time_feature
+        return features
 
     def _pad_dataframe(
         self,
@@ -371,7 +394,7 @@ def _pad_dataframe(
         k: int,
         zero_padding: bool,
         colnames: list[str],
-        time_feature: list[str],
+        features: list[str],
         id: int,
         pad_from_below: bool,
     ) -> pd.DataFrame:
@@ -393,8 +416,8 @@ def _pad_dataframe(
             text associated to the id.
         colnames : list[str]
             List of column names that we wish to keep from the dataframe.
-        time_feature : list[str]
-            List of time feature column names that we wish to keep from the dataframe.
+        features : list[str]
+            List of feature column names that we wish to keep from the dataframe.
         id : int
             Which id are we padding.
         pad_from_below: bool
@@ -412,7 +435,7 @@ def _pad_dataframe(
         """
         if k <= 0:
             raise ValueError("`k` must be a positive integer.")
-        columns = time_feature + colnames + [self.id_column]
+        columns = features + colnames + [self.id_column]
         if self.label_column is not None:
             columns += [self.label_column]
 
@@ -424,7 +447,7 @@ def _pad_dataframe(
                 if self.label_column is not None:
                     # set labels to be -1 to indicate that they're padded values
                     data_dict = {
-                        **dict.fromkeys(time_feature, [0]),
+                        **dict.fromkeys(features, [0]),
                         **{c: [0] for c in colnames},
                         self.id_column: [id],
                         self.label_column: [-1],
@@ -432,7 +455,7 @@ def _pad_dataframe(
                 else:
                     # no label column to add
                     data_dict = {
-                        **dict.fromkeys(time_feature, [0]),
+                        **dict.fromkeys(features, [0]),
                         **{c: [0] for c in colnames},
                         self.id_column: [id],
                     }
@@ -471,7 +494,7 @@ def _pad_id(
         k: int,
         zero_padding: bool,
         colnames: list[str],
-        time_feature: list[str],
+        features: list[str],
         id: int,
         pad_from_below: bool,
     ) -> pd.DataFrame:
@@ -497,8 +520,8 @@ def _pad_id(
             text associated to the id.
         colnames : list[str]
             List of column names that we wish to keep from the dataframe.
-        time_feature : list[str]
-            List of time feature column names that we wish to keep from the dataframe.
+        features : list[str]
+            List of feature column names that we wish to keep from the dataframe.
         id : int
             Which id are we padding.
         pad_from_below: bool
@@ -523,7 +546,7 @@ def _pad_id(
             k=k,
             zero_padding=zero_padding,
             colnames=colnames,
-            time_feature=time_feature,
+            features=features,
             id=id,
             pad_from_below=pad_from_below,
         )
@@ -533,7 +556,7 @@ def _pad_history(
         k: int,
         zero_padding: bool,
         colnames: list[str],
-        time_feature: list[str],
+        features: list[str],
         index: int,
         include_current_embedding: bool,
         pad_from_below: bool,
@@ -559,8 +582,8 @@ def _pad_history(
             text associated to the id.
         colnames : list[str]
             List of column names that we wish to keep from the dataframe.
-        time_feature : list[str]
-            List of time feature column names that we wish to keep from the dataframe.
+        features : list[str]
+            List of feature column names that we wish to keep from the dataframe.
         index : int
             Which index of the dataframe are we padding.
         pad_from_below: bool
@@ -606,7 +629,7 @@ def _pad_history(
             k=k,
             zero_padding=zero_padding,
             colnames=colnames,
-            time_feature=time_feature,
+            features=features,
             id=id,
             pad_from_below=pad_from_below,
         )
@@ -653,7 +676,7 @@ def pad(
         method: str = "k_last",
         zero_padding: bool = True,
         k: int = 5,
-        time_feature: list[str] | str | None = None,
+        features: list[str] | str | None = None,
         standardise_method: list[str] | str | None = None,
         embeddings: str = "full",
         include_current_embedding: bool = True,
@@ -697,10 +720,10 @@ def pad(
             text associated to the id.
         k : int, optional
             The requested length of the path, default 5. This is ignored if `method="max"`.
-        time_feature : list[str] | str | None, optional
-            Which time feature(s) to keep. If None, then doesn't keep any.
+        features : list[str] | str | None, optional
+            Which feature(s) to keep. If None, then doesn't keep any.
         standardise_method : str | None, optional
-            If not None, applies standardisation to the time features, default None. Options:
+            If not None, applies standardisation to the features, default None. Options:
 
             - "standardise": transforms by subtracting the mean and dividing by standard deviation
             - "normalise": transforms by dividing by the sum
@@ -750,35 +773,35 @@ def pad(
         else:
             raise ValueError("`method` must be either 'k_last' or 'max'.")
 
-        # obtain time feature colnames
-        time_feature_colnames = self._obtain_time_feature_columns(
-            time_feature=time_feature
+        # obtain feature colnames
+        feature_colnames = self._obtain_feature_columns(
+            features=features
         )
-        if len(time_feature_colnames) > 0:
+        if len(feature_colnames) > 0:
             if isinstance(standardise_method, str):
-                standardise_method = [standardise_method] * len(time_feature_colnames)
+                standardise_method = [standardise_method] * len(feature_colnames)
             elif isinstance(standardise_method, list) and (
-                len(standardise_method) != len(time_feature_colnames)
+                len(standardise_method) != len(feature_colnames)
             ):
                 raise ValueError(
                     "if `standardise_method` is a list, it must have the same length "
-                    f"as the number of time features requested: {len(time_feature_colnames)}."
+                    f"as the number of features requested: {len(feature_colnames)}."
                 )
 
         if standardise_method is not None:
-            # standardises the time features in .df
+            # standardises the features in .df
             self.standardise_transform = {}
-            for i in range(len(time_feature_colnames)):
+            for i in range(len(feature_colnames)):
                 standardise = self._standardise_pd(
-                    vec=self.df[time_feature_colnames[i]], method=standardise_method[i]
+                    vec=self.df[feature_colnames[i]], method=standardise_method[i]
                 )
-                self.standardise_transform[time_feature_colnames[i]] = standardise[
+                self.standardise_transform[feature_colnames[i]] = standardise[
                     "transform"
                 ]
-                self.df[time_feature_colnames[i]] = standardise["standardised_pd"]
+                self.df[feature_colnames[i]] = standardise["standardised_pd"]
 
         # obtain colnames of embeddings
-        colnames = self._obtain_colnames(embeddings=embeddings)
+        colnames = self._obtain_embedding_colnames(embeddings=embeddings)
 
         if pad_by == "id":
             # pad each of the ids in id_column and store them in a list
@@ -787,7 +810,7 @@ def pad(
                     k=k,
                     zero_padding=zero_padding,
                     colnames=colnames,
-                    time_feature=time_feature_colnames,
+                    features=feature_colnames,
                     id=id,
                     pad_from_below=pad_from_below,
                 )
@@ -801,7 +824,7 @@ def pad(
                     k=k,
                     zero_padding=zero_padding,
                     colnames=colnames,
-                    time_feature=time_feature_colnames,
+                    features=feature_colnames,
                     index=index,
                     include_current_embedding=include_current_embedding,
                     pad_from_below=pad_from_below,
@@ -828,7 +851,7 @@ def get_time_feature(
     ) -> dict[str, np.array | Callable | None]:
         """
         Returns a `np.array` object of the time_feature that is requested
-        (the string passed has to be one of the strings in `._time_feature_choices`).
+        (the string passed has to be one of the strings in `._feature_list`).
 
         Parameters
         ----------
@@ -850,11 +873,11 @@ def get_time_feature(
         ------
         ValueError
             if `time_feature` is not in the possible time_features
-            (can be found in `._time_feature_choices` attribute).
+            (can be found in `._feature_list` attribute).
         """
-        if time_feature not in self._time_feature_choices:
+        if time_feature not in self._feature_list:
             raise ValueError(
-                f"`time_feature` should be in {self._time_feature_choices}."
+                f"`time_feature` should be in {self._feature_list}."
             )
 
         if not self.time_features_added:
@@ -873,15 +896,15 @@ def get_time_feature(
 
         return {"time_feature": np.array(self.df[time_feature]), "transform": None}
 
-    def get_path(self, include_time_features: bool = True) -> np.array:
+    def get_path(self, include_features: bool = True) -> np.array:
         """
         Returns a `np.array` object of the path.
-        Includes the time features by default (if they are present after the padding).
+        Includes the features by default (if they are present after the padding).
 
         Parameters
         ----------
-        include_time_features : bool, optional
-            Whether or not to keep the time features, by default True.
+        include_features : bool, optional
+            Whether or not to keep the features, by default True.
 
         Returns
         -------
@@ -906,14 +929,14 @@ def get_path(self, include_time_features: bool = True) -> np.array:
             # (which stores id_column)
             path = self.array_padded[:, :, :-1]
 
-        if not include_time_features:
-            # computes how many time features there are currently
-            # (which occur in the first n_time_features columns)
-            n_time_features = len(
-                [item for item in self._time_feature_choices if item in self.df_padded]
+        if not include_features:
+            # computes how many features there are currently
+            # (which occur in the first n_features columns)
+            n_features = len(
+                [item for item in self._feature_list if item in self.df_padded]
             )
-            # removes any time features (if they're present)
-            path = path[:, :, n_time_features:]
+            # removes any features (if they're present)
+            path = path[:, :, n_features:]
 
         return path.astype("float")
 
@@ -945,8 +968,8 @@ def get_embeddings(self, reduced_embeddings: bool = False) -> np.array:
 
     def get_torch_path_for_SWNUNetwork(
         self,
-        include_time_features_in_path: bool,
-        include_time_features_in_input: bool,
+        include_features_in_path: bool,
+        include_features_in_input: bool,
         include_embedding_in_input: bool,
         reduced_embeddings: bool = False,
     ) -> tuple[torch.tensor, int]:
@@ -955,10 +978,10 @@ def get_torch_path_for_SWNUNetwork(
 
         Parameters
         ----------
-        include_time_features_in_path : bool
-            Whether or not to keep time features within the path.
-        include_time_features_in_input : bool
-            Whether or not to concatenate the time feature into the feed-forward neural
+        include_features_in_path : bool
+            Whether or not to keep the additional features (e.g. time features) within the path.
+        include_features_in_input : bool
+            Whether or not to concatenate the additional features into the feed-forward neural
             network in the `nlpsig_networks.SWNUNetwork` model.
         include_embedding_in_input : bool
             Whether or not to concatenate the embeddings into the feed-forward neural
@@ -984,10 +1007,10 @@ def get_torch_path_for_SWNUNetwork(
             raise ValueError("Need to first call to create the path `.pad()`.")
 
         # obtains a torch tensor which can be inputted into deepsignet
-        # computes how many time features there are currently
-        # (which occur in the first n_time_features columns)
-        n_time_features = len(
-            [item for item in self._time_feature_choices if item in self.df_padded]
+        # computes how many features there are currently
+        # (which occur in the first n_features columns)
+        n_features = len(
+            [item for item in self._feature_list if item in self.df_padded]
         )
 
         if include_embedding_in_input:
@@ -1047,39 +1070,39 @@ def get_torch_path_for_SWNUNetwork(
                 .transpose(1, 2)
             )
 
-        if include_time_features_in_path:
-            # make sure path includes the time features
-            path = torch.from_numpy(self.get_path(include_time_features=True))
+        if include_features_in_path:
+            # make sure path includes the features
+            path = torch.from_numpy(self.get_path(include_features=True))
             input_channels = path.shape[2]
-            if include_time_features_in_input:
-                # need to repeat the time feature columns
-                # if there are no time features, then we don't need to repeat anything
-                if n_time_features == 1:
+            if include_features_in_input:
+                # need to repeat the feature columns
+                # if there are no features, then we don't need to repeat anything
+                if n_features == 1:
                     path = torch.cat([path, path[:, :, 0].unsqueeze(2)], dim=2)
-                elif n_time_features > 1:
-                    path = torch.cat([path, path[:, :, 0:n_time_features]], dim=2)
+                elif n_features > 1:
+                    path = torch.cat([path, path[:, :, 0:n_features]], dim=2)
         else:
-            if include_time_features_in_input:
-                # path doesn't need to include the time features
+            if include_features_in_input:
+                # path doesn't need to include the features
                 # but we still want to include them in the input to the FFN for classification
-                path = torch.from_numpy(self.get_path(include_time_features=True))
-                input_channels = path.shape[2] - n_time_features
-                # need to move time features to the end of the path
-                # if there are no time features, then we don't need to move anything
-                if n_time_features == 1:
+                path = torch.from_numpy(self.get_path(include_features=True))
+                input_channels = path.shape[2] - n_features
+                # need to move features to the end of the path
+                # if there are no features, then we don't need to move anything
+                if n_features == 1:
                     path = torch.cat(
-                        [path[:, :, n_time_features:], path[:, :, 0].unsqueeze(2)],
+                        [path[:, :, n_features:], path[:, :, 0].unsqueeze(2)],
                         dim=2,
                     )
-                elif n_time_features > 1:
+                elif n_features > 1:
                     path = torch.cat(
-                        [path[:, :, n_time_features:], path[:, :, 0:n_time_features]],
+                        [path[:, :, n_features:], path[:, :, 0:n_features]],
                         dim=2,
                     )
             else:
-                # path doesn't need to include the time features
+                # path doesn't need to include the features
                 # and don't need to include them in the input to the FFN for classification
-                path = torch.from_numpy(self.get_path(include_time_features=False))
+                path = torch.from_numpy(self.get_path(include_features=False))
                 input_channels = path.shape[2]
 
         if include_embedding_in_input:
@@ -1138,8 +1161,8 @@ def get_torch_path_for_SeqSigNet(
         shift: int,
         window_size: int,
         n: int,
-        include_time_features_in_path: bool,
-        include_time_features_in_input: bool,
+        include_features_in_path: bool,
+        include_features_in_input: bool,
         include_embedding_in_input: bool,
         reduced_embeddings: bool = False,
     ) -> tuple[torch.tensor, int]:
@@ -1154,10 +1177,10 @@ def get_torch_path_for_SeqSigNet(
             Size of the window we use over the texts.
         n : int
             Number of units we wish to use in SeqSigNet.
-        include_time_features_in_path : bool
-            Whether or not to keep time features within the path.
-        include_time_features_in_input : bool
-            Whether or not to concatenate the time feature into the feed-forward neural
+        include_features_in_path : bool
+            Whether or not to keep the additional features (e.g. time features) within the path.
+        include_features_in_input : bool
+            Whether or not to concatenate the additional features into the feed-forward neural
             network in the `nlpsig_networks.SeqSigNet` model.
         include_embedding_in_input : bool
             Whether or not to concatenate the embeddings into the feed-forward neural
@@ -1194,8 +1217,8 @@ def get_torch_path_for_SeqSigNet(
 
         # obtain 3 dimensional tensor with dimensions [batch, history, channels]
         swnu_path, input_channels = self.get_torch_path_for_SWNUNetwork(
-            include_time_features_in_path=include_time_features_in_path,
-            include_time_features_in_input=include_time_features_in_input,
+            include_features_in_path=include_features_in_path,
+            include_features_in_input=include_features_in_input,
             include_embedding_in_input=include_embedding_in_input,
             reduced_embeddings=reduced_embeddings,
         )
diff --git a/tests/test_data_preparation.py b/tests/test_data_preparation.py
index a1d99f5..b9fabaf 100644
--- a/tests/test_data_preparation.py
+++ b/tests/test_data_preparation.py
@@ -31,11 +31,11 @@ def test_default_initialisation_datetime(
         1
         + len(obj.original_df.columns)
         + emb.shape[1]
-        + len(obj._time_feature_choices)
+        + len(obj._feature_list)
         + 1,
     )
     assert obj.pooled_embeddings is None
-    assert set(obj._time_feature_choices) == {
+    assert set(obj._feature_list) == {
         "time_encoding",
         "time_diff",
         "timeline_index",
@@ -69,11 +69,11 @@ def test_default_initialisation_no_time(
         len(obj.original_df.index),
         len(obj.original_df.columns)
         + emb.shape[1]
-        + len(obj._time_feature_choices)
+        + len(obj._feature_list)
         + 1,
     )
     assert obj.pooled_embeddings is None
-    assert obj._time_feature_choices == ["timeline_index"]
+    assert obj._feature_list == ["timeline_index"]
     assert obj.time_features_added
     assert obj.df_padded is None
     assert obj.array_padded is None
@@ -108,10 +108,10 @@ def test_initialisation_with_id_and_label_datetime(
         1
         + len(obj.original_df.columns)
         + emb.shape[1]
-        + len(obj._time_feature_choices),
+        + len(obj._feature_list),
     )
     assert obj.pooled_embeddings is None
-    assert set(obj._time_feature_choices) == {
+    assert set(obj._feature_list) == {
         "time_encoding",
         "time_diff",
         "timeline_index",
@@ -148,10 +148,10 @@ def test_initialisation_with_id_and_label_no_time(
     # 1 label column
     assert obj.df.shape == (
         len(obj.original_df.index),
-        len(obj.original_df.columns) + emb.shape[1] + len(obj._time_feature_choices),
+        len(obj.original_df.columns) + emb.shape[1] + len(obj._feature_list),
     )
     assert obj.pooled_embeddings is None
-    assert obj._time_feature_choices == ["timeline_index"]
+    assert obj._feature_list == ["timeline_index"]
     assert obj.time_features_added
     assert obj.df_padded is None
     assert obj.array_padded is None
@@ -205,11 +205,11 @@ def test_initialisation_with_reduced_emb_datetime(
         + len(obj.original_df.columns)
         + emb.shape[1]
         + emb_reduced.shape[1]
-        + len(obj._time_feature_choices)
+        + len(obj._feature_list)
         + 1,
     )
     assert obj.pooled_embeddings is None
-    assert set(obj._time_feature_choices) == {
+    assert set(obj._feature_list) == {
         "time_encoding",
         "time_diff",
         "timeline_index",
@@ -248,11 +248,11 @@ def test_initialisation_with_reduced_emb_no_time(
         len(obj.original_df.columns)
         + emb.shape[1]
         + emb_reduced.shape[1]
-        + len(obj._time_feature_choices)
+        + len(obj._feature_list)
         + 1,
     )
     assert obj.pooled_embeddings is None
-    assert obj._time_feature_choices == ["timeline_index"]
+    assert obj._feature_list == ["timeline_index"]
     assert obj.time_features_added
     assert obj.df_padded is None
     assert obj.array_padded is None
@@ -275,9 +275,6 @@ def test_initialisation_with_pooled_emb_datetime(
         pooled_embeddings=emb_pooled,
         id_column="id_col",
     )
-    # should have an error as we haven't passed in the id column,
-    # and so it expects the number of rows in emb_pooled to
-    # equal the number of rows in the dataframe
     pd.testing.assert_frame_equal(obj.original_df, test_df_with_datetime)
     assert obj.id_column == "id_col"
     assert obj.label_column is None
@@ -297,10 +294,10 @@ def test_initialisation_with_pooled_emb_datetime(
         + len(obj.original_df.columns)
         + emb.shape[1]
         + emb_reduced.shape[1]
-        + len(obj._time_feature_choices),
+        + len(obj._feature_list),
     )
     assert (obj.pooled_embeddings == emb_pooled).all()
-    assert set(obj._time_feature_choices) == {
+    assert set(obj._feature_list) == {
         "time_encoding",
         "time_diff",
         "timeline_index",
@@ -327,9 +324,6 @@ def test_initialisation_with_pooled_emb_no_time(
         pooled_embeddings=emb_pooled,
         id_column="id_col",
     )
-    # should have an error as we haven't passed in the id column,
-    # and so it expects the number of rows in emb_pooled to
-    # equal the number of rows in the dataframe
     pd.testing.assert_frame_equal(obj.original_df, test_df_no_time)
     assert obj.id_column == "id_col"
     assert obj.label_column is None
@@ -347,10 +341,10 @@ def test_initialisation_with_pooled_emb_no_time(
         len(obj.original_df.columns)
         + emb.shape[1]
         + emb_reduced.shape[1]
-        + len(obj._time_feature_choices),
+        + len(obj._feature_list),
     )
     assert (obj.pooled_embeddings == emb_pooled).all()
-    assert obj._time_feature_choices == ["timeline_index"]
+    assert obj._feature_list == ["timeline_index"]
     assert obj.time_features_added
     assert obj.df_padded is None
     assert obj.array_padded is None
@@ -500,14 +494,14 @@ def test_PrepareData_obtain_colnames_emb(test_df_with_datetime, emb):
 
     # test cases where only embeddings are passed
     obj = PrepareData(original_df=test_df_with_datetime, embeddings=emb)
-    assert obj._obtain_colnames(embeddings="full") == emb_names
-    assert obj._obtain_colnames(embeddings="dim_reduced") == []
-    assert obj._obtain_colnames(embeddings="both") == emb_names
+    assert obj._obtain_embedding_colnames(embeddings="full") == emb_names
+    assert obj._obtain_embedding_colnames(embeddings="dim_reduced") == []
+    assert obj._obtain_embedding_colnames(embeddings="both") == emb_names
 
     with pytest.raises(
         ValueError, match="Embeddings must be either 'dim_reduced', 'full', or 'both'"
     ):
-        obj._obtain_colnames(embeddings="")
+        obj._obtain_embedding_colnames(embeddings="")
 
 
 def test_obtain_colnames_both(test_df_with_datetime, emb, emb_reduced):
@@ -520,85 +514,89 @@ def test_obtain_colnames_both(test_df_with_datetime, emb, emb_reduced):
         embeddings=emb,
         embeddings_reduced=emb_reduced,
     )
-    assert obj._obtain_colnames(embeddings="full") == emb_names
-    assert obj._obtain_colnames(embeddings="dim_reduced") == emb_reduced_names
-    assert obj._obtain_colnames(embeddings="both") == emb_reduced_names + emb_names
+    assert obj._obtain_embedding_colnames(embeddings="full") == emb_names
+    assert obj._obtain_embedding_colnames(embeddings="dim_reduced") == emb_reduced_names
+    assert obj._obtain_embedding_colnames(embeddings="both") == emb_reduced_names + emb_names
 
 
-def test_obtain_time_feature_columns_string(test_df_with_datetime, emb):
+def test_obtain_feature_columns_string(test_df_with_datetime, emb):
     # default initialisation
     obj = PrepareData(original_df=test_df_with_datetime, embeddings=emb)
-    assert set(obj._time_feature_choices) == {
+    assert set(obj._feature_list) == {
         "time_encoding",
         "time_diff",
         "timeline_index",
     }
-    assert obj._obtain_time_feature_columns("timeline_index") == ["timeline_index"]
+    assert obj._obtain_feature_columns("timeline_index") == ["timeline_index"]
 
 
-def test_obtain_time_feature_columns_list(test_df_with_datetime, emb):
+def test_obtain_feature_columns_list(test_df_with_datetime, emb):
     # default initialisation
     obj = PrepareData(original_df=test_df_with_datetime, embeddings=emb)
-    assert set(obj._time_feature_choices) == {
+    assert set(obj._feature_list) == {
         "time_encoding",
         "time_diff",
         "timeline_index",
     }
-    assert obj._obtain_time_feature_columns(["time_encoding", "timeline_index"]) == [
+    assert obj._obtain_feature_columns(["time_encoding", "timeline_index"]) == [
         "time_encoding",
         "timeline_index",
     ]
 
 
-def test_obtain_time_feature_columns_none(test_df_with_datetime, emb):
+def test_obtain_feature_columns_none(test_df_with_datetime, emb):
     # default initialisation
     obj = PrepareData(original_df=test_df_with_datetime, embeddings=emb)
-    assert set(obj._time_feature_choices) == {
+    assert set(obj._feature_list) == {
         "time_encoding",
         "time_diff",
         "timeline_index",
     }
-    assert obj._obtain_time_feature_columns(None) == []
+    assert obj._obtain_feature_columns(None) == []
 
 
-def test_obtain_time_feature_columns_string_not_in(test_df_with_datetime, emb):
+def test_obtain_feature_columns_string_not_in(test_df_with_datetime, emb):
     # default initialisation
     obj = PrepareData(original_df=test_df_with_datetime, embeddings=emb)
-    assert set(obj._time_feature_choices) == {
+    assert set(obj._feature_list) == {
         "time_encoding",
         "time_diff",
         "timeline_index",
     }
+    incorrect_name = "TEST_COLUMN"
     with pytest.raises(
         ValueError,
         match=re.escape(
-            f"If `time_feature` is a string, it must be in {obj._time_feature_choices}."
+            f"{incorrect_name} must be in `self.feature_list`: {obj._feature_list},"
+            "or a column in `self.df`."
         ),
     ):
-        obj._obtain_time_feature_columns("TEST_COLUMN")
+        obj._obtain_feature_columns(incorrect_name)
 
 
-def test_obtain_time_feature_columns_list_not_in(test_df_with_datetime, emb):
+def test_obtain_feature_columns_list_not_in(test_df_with_datetime, emb):
     # default initialisation
     obj = PrepareData(original_df=test_df_with_datetime, embeddings=emb)
-    assert set(obj._time_feature_choices) == {
+    assert set(obj._feature_list) == {
         "time_encoding",
         "time_diff",
         "timeline_index",
     }
+    incorrect_name = "TEST_COLUMN"
     with pytest.raises(
         ValueError,
         match=re.escape(
-            f"Each item in `time_feature` should be in {obj._time_feature_choices}."
+            f"{incorrect_name} must be in `self.feature_list`: {obj._feature_list},"
+            "or a column in `self.df`."
         ),
     ):
-        obj._obtain_time_feature_columns(["timeline_index", "TEST_COLUMN"])
+        obj._obtain_feature_columns(["timeline_index", incorrect_name])
 
 
-def test_obtain_time_feature_columns_type(test_df_with_datetime, emb):
+def test_obtain_feature_columns_type(test_df_with_datetime, emb):
     # default initialisation
     obj = PrepareData(original_df=test_df_with_datetime, embeddings=emb)
-    assert set(obj._time_feature_choices) == {
+    assert set(obj._feature_list) == {
         "time_encoding",
         "time_diff",
         "timeline_index",
@@ -607,7 +605,7 @@ def test_obtain_time_feature_columns_type(test_df_with_datetime, emb):
         TypeError,
         match="`time_feature` must be either None, a string, or a list of strings.",
     ):
-        obj._obtain_time_feature_columns(0)
+        obj._obtain_feature_columns(0)
 
 
 def test_standardise_pd_standardise(vec_to_standardise, test_df_no_time, emb):
diff --git a/tests/test_padding.py b/tests/test_padding.py
index 177ebee..17040bc 100644
--- a/tests/test_padding.py
+++ b/tests/test_padding.py
@@ -22,7 +22,7 @@ def test_pad_dataframe_zero_padding_from_below_without_label(test_df_no_time, em
         k=k,
         zero_padding=True,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         id=0,
         pad_from_below=True,
     )
@@ -61,7 +61,7 @@ def test_pad_dataframe_zero_padding_from_below_with_label(test_df_no_time, emb):
         k=k,
         zero_padding=True,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         id=0,
         pad_from_below=True,
     )
@@ -99,7 +99,7 @@ def test_pad_dataframe_zero_padding_from_above_without_label(test_df_no_time, em
         k=k,
         zero_padding=True,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         id=0,
         pad_from_below=False,
     )
@@ -137,7 +137,7 @@ def test_pad_dataframe_zero_padding_from_above_with_label(test_df_no_time, emb):
         k=k,
         zero_padding=True,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         id=0,
         pad_from_below=False,
     )
@@ -176,7 +176,7 @@ def test_pad_dataframe_non_zero_padding_from_below(test_df_no_time, emb):
         k=k,
         zero_padding=False,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         id=0,
         pad_from_below=True,
     )
@@ -208,7 +208,7 @@ def test_pad_dataframe_non_zero_padding_from_above(test_df_no_time, emb):
         k=k,
         zero_padding=False,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         id=0,
         pad_from_below=False,
     )
@@ -236,8 +236,8 @@ def test_pad_dataframe_k_equal_zero(test_df_no_time, test_df_to_pad, emb):
             df=test_df_to_pad,
             k=0,
             zero_padding=False,
-            colnames=obj._obtain_colnames("full"),
-            time_feature=["timeline_index"],
+            colnames=obj._obtain_embedding_colnames("full"),
+            features=["timeline_index"],
             id=0,
             pad_from_below=False,
         )
@@ -257,8 +257,8 @@ def test_pad_dataframe_k_negative(test_df_no_time, test_df_to_pad, emb):
             df=test_df_to_pad,
             k=-1,
             zero_padding=False,
-            colnames=obj._obtain_colnames("full"),
-            time_feature=["timeline_index"],
+            colnames=obj._obtain_embedding_colnames("full"),
+            features=["timeline_index"],
             id=0,
             pad_from_below=False,
         )
@@ -279,7 +279,7 @@ def test_pad_dataframe_no_pad(test_df_no_time, emb):
         k=k,
         zero_padding=False,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         id=0,
         pad_from_below=False,
     )
@@ -304,7 +304,7 @@ def test_pad_dataframe_cutoff(test_df_no_time, emb):
         k=k,
         zero_padding=False,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         id=0,
         pad_from_below=False,
     )
@@ -326,8 +326,8 @@ def test_pad_id_k_equal_zero(test_df_no_time, emb):
         obj._pad_id(
             k=-1,
             zero_padding=False,
-            colnames=obj._obtain_colnames("full"),
-            time_feature=["timeline_index"],
+            colnames=obj._obtain_embedding_colnames("full"),
+            features=["timeline_index"],
             id=0,
             pad_from_below=False,
         )
@@ -345,8 +345,8 @@ def test_pad_id_k_negative(test_df_no_time, emb):
         obj._pad_id(
             k=-1,
             zero_padding=False,
-            colnames=obj._obtain_colnames("full"),
-            time_feature=["timeline_index"],
+            colnames=obj._obtain_embedding_colnames("full"),
+            features=["timeline_index"],
             id=0,
             pad_from_below=False,
         )
@@ -366,7 +366,7 @@ def test_pad_id_zero_padding_from_below(test_df_no_time, emb):
         k=k,
         zero_padding=True,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         id=0,
         pad_from_below=True,
     )
@@ -402,7 +402,7 @@ def test_pad_id_zero_padding_from_above(test_df_no_time, emb):
         k=k,
         zero_padding=True,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         id=0,
         pad_from_below=False,
     )
@@ -443,7 +443,7 @@ def test_pad_id_non_zero_padding_from_below(test_df_no_time, emb):
         k=k,
         zero_padding=False,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         id=0,
         pad_from_below=True,
     )
@@ -477,7 +477,7 @@ def test_pad_id_non_zero_padding_from_above(test_df_no_time, emb):
         k=k,
         zero_padding=False,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         id=0,
         pad_from_below=False,
     )
@@ -508,7 +508,7 @@ def test_pad_id_no_pad(test_df_no_time, emb):
         k=k,
         zero_padding=False,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         id=0,
         pad_from_below=False,
     )
@@ -536,7 +536,7 @@ def test_pad_id_cutoff(test_df_no_time, emb):
         k=k,
         zero_padding=False,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         id=0,
         pad_from_below=False,
     )
@@ -557,7 +557,7 @@ def test_pad_history_zero_padding_no_history_from_below(test_df_no_time, emb):
         k=k,
         zero_padding=True,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         index=index,
         include_current_embedding=False,
         pad_from_below=True,
@@ -587,7 +587,7 @@ def test_pad_history_zero_padding_no_history_from_above(test_df_no_time, emb):
         k=k,
         zero_padding=True,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         index=index,
         include_current_embedding=False,
         pad_from_below=False,
@@ -619,7 +619,7 @@ def test_pad_history_zero_padding_some_history_from_below(test_df_no_time, emb):
         k=k,
         zero_padding=True,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         index=index,
         include_current_embedding=False,
         pad_from_below=True,
@@ -655,7 +655,7 @@ def test_pad_history_zero_padding_some_history_from_above(test_df_no_time, emb):
         k=k,
         zero_padding=True,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         index=index,
         include_current_embedding=False,
         pad_from_below=False,
@@ -688,7 +688,7 @@ def test_pad_history_non_zero_padding_no_history_from_below(test_df_no_time, emb
         k=k,
         zero_padding=False,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         index=index,
         include_current_embedding=False,
         pad_from_below=True,
@@ -718,7 +718,7 @@ def test_pad_history_non_zero_padding_no_history_from_above(test_df_no_time, emb
         k=k,
         zero_padding=False,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         index=index,
         include_current_embedding=False,
         pad_from_below=False,
@@ -750,7 +750,7 @@ def test_pad_history_non_zero_padding_some_history_from_below(test_df_no_time, e
         k=k,
         zero_padding=False,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         index=index,
         include_current_embedding=False,
         pad_from_below=True,
@@ -780,7 +780,7 @@ def test_pad_history_non_zero_padding_some_history_from_above(test_df_no_time, e
         k=k,
         zero_padding=False,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         index=index,
         include_current_embedding=False,
         pad_from_below=False,
@@ -814,7 +814,7 @@ def test_pad_history_just_enough_history(test_df_no_time, emb):
         k=k,
         zero_padding=False,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         index=index,
         include_current_embedding=False,
         pad_from_below=False,
@@ -845,7 +845,7 @@ def test_pad_history_many_history_cutoff(test_df_no_time, emb):
         k=k,
         zero_padding=False,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         index=index,
         include_current_embedding=False,
         pad_from_below=False,
@@ -872,7 +872,7 @@ def test_pad_history_no_history_zero_padding_include_current_from_below(
         k=k,
         zero_padding=True,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         index=index,
         include_current_embedding=True,
         pad_from_below=True,
@@ -911,7 +911,7 @@ def test_pad_history_no_history_zero_padding_include_current_from_above(
         k=k,
         zero_padding=True,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         index=index,
         include_current_embedding=True,
         pad_from_below=False,
@@ -950,7 +950,7 @@ def test_pad_history_some_history_zero_padding_include_current_from_below(
         k=k,
         zero_padding=True,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         index=index,
         include_current_embedding=True,
         pad_from_below=True,
@@ -990,7 +990,7 @@ def test_pad_history_some_history_zero_padding_include_current_from_above(
         k=k,
         zero_padding=True,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         index=index,
         include_current_embedding=True,
         pad_from_below=False,
@@ -1028,7 +1028,7 @@ def test_pad_history_no_history_non_zero_padding_include_current_from_below(
         k=k,
         zero_padding=False,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         index=index,
         include_current_embedding=True,
         pad_from_below=True,
@@ -1056,7 +1056,7 @@ def test_pad_history_no_history_non_zero_padding_include_current_from_above(
         k=k,
         zero_padding=False,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         index=index,
         include_current_embedding=True,
         pad_from_below=False,
@@ -1086,7 +1086,7 @@ def test_pad_history_some_history_non_zero_padding_include_current_from_below(
         k=k,
         zero_padding=False,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         index=index,
         include_current_embedding=True,
         pad_from_below=True,
@@ -1119,7 +1119,7 @@ def test_pad_history_some_history_non_zero_padding_include_current_from_above(
         k=k,
         zero_padding=False,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         index=index,
         include_current_embedding=True,
         pad_from_below=False,
@@ -1152,7 +1152,7 @@ def test_pad_history_just_enough_history_include_current(test_df_no_time, emb):
         k=k,
         zero_padding=False,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         index=index,
         include_current_embedding=True,
         pad_from_below=False,
@@ -1182,7 +1182,7 @@ def test_pad_history_many_history_include_current(test_df_no_time, emb):
         k=k,
         zero_padding=False,
         colnames=colnames,
-        time_feature=["timeline_index"],
+        features=["timeline_index"],
         index=index,
         include_current_embedding=True,
         pad_from_below=False,
@@ -1206,7 +1206,7 @@ def test_pad_by_id_k_last(test_df_no_time, emb):
         method="k_last",
         zero_padding=True,
         k=k,
-        time_feature="timeline_index",
+        features="timeline_index",
         standardise_method=None,
         embeddings="full",
         include_current_embedding=True,
@@ -1214,7 +1214,7 @@ def test_pad_by_id_k_last(test_df_no_time, emb):
     )
     # number of columns is:
     # number of time features + number of columns in emb + id col + label col
-    ncol = len(obj._time_feature_choices) + emb.shape[1] + 1 + 1
+    ncol = len(obj._feature_list) + emb.shape[1] + 1 + 1
     assert type(obj.df_padded) == pd.DataFrame
     assert obj.df_padded.shape == (k * len(obj.original_df["id_col"].unique()), ncol)
     assert type(obj.array_padded) == np.ndarray
@@ -1233,7 +1233,7 @@ def test_pad_by_id_max(test_df_no_time, emb):
         pad_by="id",
         method="max",
         zero_padding=True,
-        time_feature="timeline_index",
+        features="timeline_index",
         standardise_method=None,
         embeddings="full",
         include_current_embedding=True,
@@ -1241,7 +1241,7 @@ def test_pad_by_id_max(test_df_no_time, emb):
     )
     # number of columns is:
     # number of time features + number of columns in emb + id col + label col
-    ncol = len(obj._time_feature_choices) + emb.shape[1] + 1 + 1
+    ncol = len(obj._feature_list) + emb.shape[1] + 1 + 1
     assert type(obj.df_padded) == pd.DataFrame
     k = obj.original_df["id_col"].value_counts().max()
     assert obj.df_padded.shape == (k * len(obj.original_df["id_col"].unique()), ncol)
@@ -1263,7 +1263,7 @@ def test_pad_by_history_k_last(test_df_no_time, emb):
         method="k_last",
         zero_padding=True,
         k=k,
-        time_feature="timeline_index",
+        features="timeline_index",
         standardise_method=None,
         embeddings="full",
         include_current_embedding=True,
@@ -1271,7 +1271,7 @@ def test_pad_by_history_k_last(test_df_no_time, emb):
     )
     # number of columns is:
     # number of time features + number of columns in emb + id col + label col
-    ncol = len(obj._time_feature_choices) + emb.shape[1] + 1 + 1
+    ncol = len(obj._feature_list) + emb.shape[1] + 1 + 1
     assert type(obj.df_padded) == pd.DataFrame
     assert obj.df_padded.shape == (k * len(obj.original_df.index), ncol)
     assert type(obj.array_padded) == np.ndarray
@@ -1290,7 +1290,7 @@ def test_pad_by_history_max(test_df_no_time, emb):
         pad_by="history",
         method="max",
         zero_padding=True,
-        time_feature="timeline_index",
+        features="timeline_index",
         standardise_method=None,
         embeddings="full",
         include_current_embedding=True,
@@ -1298,7 +1298,7 @@ def test_pad_by_history_max(test_df_no_time, emb):
     )
     # number of columns is:
     # number of time features + number of columns in emb + id col + label col
-    ncol = len(obj._time_feature_choices) + emb.shape[1] + 1 + 1
+    ncol = len(obj._feature_list) + emb.shape[1] + 1 + 1
     assert type(obj.df_padded) == pd.DataFrame
     k = obj.original_df["id_col"].value_counts().max()
     assert obj.df_padded.shape == (k * len(obj.original_df.index), ncol)
@@ -1319,7 +1319,7 @@ def test_pad_wrong_pad_by(test_df_no_time, emb):
             pad_by="fake_pad_by",
             method="max",
             zero_padding=True,
-            time_feature="timeline_index",
+            features="timeline_index",
             standardise_method=None,
             embeddings="full",
             include_current_embedding=True,
@@ -1339,7 +1339,7 @@ def test_pad_wrong_method(test_df_no_time, emb):
             pad_by="id",
             method="fake_method",
             zero_padding=True,
-            time_feature="timeline_index",
+            features="timeline_index",
             standardise_method=None,
             embeddings="full",
             include_current_embedding=True,
@@ -1360,7 +1360,7 @@ def test_pad_by_id_k_last_standardise_standardise(test_df_no_time, emb):
         method="k_last",
         zero_padding=True,
         k=k,
-        time_feature="timeline_index",
+        features="timeline_index",
         standardise_method="standardise",
         embeddings="full",
         include_current_embedding=True,
@@ -1372,7 +1372,7 @@ def test_pad_by_id_k_last_standardise_standardise(test_df_no_time, emb):
     pd.testing.assert_series_equal(obj.df["timeline_index"], standardise_vec)
     # number of columns is:
     # number of time features + number of columns in emb + id col + label col
-    ncol = len(obj._time_feature_choices) + emb.shape[1] + 1 + 1
+    ncol = len(obj._feature_list) + emb.shape[1] + 1 + 1
     assert type(obj.df_padded) == pd.DataFrame
     assert obj.df_padded.shape == (k * len(obj.original_df["id_col"].unique()), ncol)
     assert type(obj.array_padded) == np.ndarray
@@ -1393,7 +1393,7 @@ def test_pad_by_id_k_last_standardise_normalise(test_df_no_time, emb):
         method="k_last",
         zero_padding=True,
         k=k,
-        time_feature="timeline_index",
+        features="timeline_index",
         standardise_method="normalise",
         embeddings="full",
         include_current_embedding=True,
@@ -1405,7 +1405,7 @@ def test_pad_by_id_k_last_standardise_normalise(test_df_no_time, emb):
     pd.testing.assert_series_equal(obj.df["timeline_index"], normalise_vec)
     # number of columns is:
     # number of time features + number of columns in emb + id col + label col
-    ncol = len(obj._time_feature_choices) + emb.shape[1] + 1 + 1
+    ncol = len(obj._feature_list) + emb.shape[1] + 1 + 1
     assert type(obj.df_padded) == pd.DataFrame
     assert obj.df_padded.shape == (k * len(obj.original_df["id_col"].unique()), ncol)
     assert type(obj.array_padded) == np.ndarray
@@ -1426,7 +1426,7 @@ def test_pad_by_id_k_last_standardise_minmax(test_df_no_time, emb):
         method="k_last",
         zero_padding=True,
         k=k,
-        time_feature="timeline_index",
+        features="timeline_index",
         standardise_method="minmax",
         embeddings="full",
         include_current_embedding=True,
@@ -1438,7 +1438,7 @@ def test_pad_by_id_k_last_standardise_minmax(test_df_no_time, emb):
     pd.testing.assert_series_equal(obj.df["timeline_index"], minmax_vec)
     # number of columns is:
     # number of time features + number of columns in emb + id col + label col
-    ncol = len(obj._time_feature_choices) + emb.shape[1] + 1 + 1
+    ncol = len(obj._feature_list) + emb.shape[1] + 1 + 1
     assert type(obj.df_padded) == pd.DataFrame
     assert obj.df_padded.shape == (k * len(obj.original_df["id_col"].unique()), ncol)
     assert type(obj.array_padded) == np.ndarray
@@ -1454,7 +1454,7 @@ def test_pad_by_id_k_last_standardise_multiple(test_df_with_datetime, emb):
         label_column="label_col",
     )
     k = 10
-    time_features = ["timeline_index", "time_encoding", "time_diff"]
+    featuress = ["timeline_index", "time_encoding", "time_diff"]
     # expected standardised vectors
     standardised_vec = obj._standardise_pd(
         vec=obj.df["timeline_index"], method="standardise"
@@ -1469,7 +1469,7 @@ def test_pad_by_id_k_last_standardise_multiple(test_df_with_datetime, emb):
         method="k_last",
         zero_padding=True,
         k=k,
-        time_feature=time_features,
+        features=featuress,
         standardise_method=["standardise", "normalise", None],
         embeddings="full",
         include_current_embedding=True,
@@ -1480,7 +1480,7 @@ def test_pad_by_id_k_last_standardise_multiple(test_df_with_datetime, emb):
     pd.testing.assert_series_equal(obj.df["time_diff"], none_standardisation_vec)
     # number of columns is:
     # number of time features + number of columns in emb + id col + label col
-    ncol = len(time_features) + emb.shape[1] + 1 + 1
+    ncol = len(featuress) + emb.shape[1] + 1 + 1
     assert type(obj.df_padded) == pd.DataFrame
     assert obj.df_padded.shape == (k * len(obj.original_df["id_col"].unique()), ncol)
     assert type(obj.array_padded) == np.ndarray

From f146263855678004cbed008cb75c557fe128bced Mon Sep 17 00:00:00 2001
From: rchan <rchan@turing.ac.uk>
Date: Mon, 14 Aug 2023 15:20:38 +0100
Subject: [PATCH 2/5] fix some print spacing

---
 src/nlpsig/data_preparation.py | 2 +-
 tests/test_data_preparation.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/nlpsig/data_preparation.py b/src/nlpsig/data_preparation.py
index 129c48c..0836be4 100644
--- a/src/nlpsig/data_preparation.py
+++ b/src/nlpsig/data_preparation.py
@@ -377,7 +377,7 @@ def _obtain_feature_columns(
                 for item in features:
                     if not self._check_feature_exists(feature=item):
                         raise ValueError(
-                            f"{item} must be in `self.feature_list`: {self._feature_list},"
+                            f"{item} must be in `self.feature_list`: {self._feature_list}, "
                             "or a column in `self.df`."
                         )
             else:
diff --git a/tests/test_data_preparation.py b/tests/test_data_preparation.py
index b9fabaf..b16cdf7 100644
--- a/tests/test_data_preparation.py
+++ b/tests/test_data_preparation.py
@@ -567,7 +567,7 @@ def test_obtain_feature_columns_string_not_in(test_df_with_datetime, emb):
     with pytest.raises(
         ValueError,
         match=re.escape(
-            f"{incorrect_name} must be in `self.feature_list`: {obj._feature_list},"
+            f"{incorrect_name} must be in `self.feature_list`: {obj._feature_list}, "
             "or a column in `self.df`."
         ),
     ):
@@ -586,7 +586,7 @@ def test_obtain_feature_columns_list_not_in(test_df_with_datetime, emb):
     with pytest.raises(
         ValueError,
         match=re.escape(
-            f"{incorrect_name} must be in `self.feature_list`: {obj._feature_list},"
+            f"{incorrect_name} must be in `self.feature_list`: {obj._feature_list}, "
             "or a column in `self.df`."
         ),
     ):

From 698a8902f87f9a208b96aa3a414e8789c51c72d4 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 14 Aug 2023 23:54:57 +0000
Subject: [PATCH 3/5] Bump pypa/gh-action-pypi-publish from 1.8.7 to 1.8.10

Bumps [pypa/gh-action-pypi-publish](https://github.com/pypa/gh-action-pypi-publish) from 1.8.7 to 1.8.10.
- [Release notes](https://github.com/pypa/gh-action-pypi-publish/releases)
- [Commits](https://github.com/pypa/gh-action-pypi-publish/compare/v1.8.7...v1.8.10)

---
updated-dependencies:
- dependency-name: pypa/gh-action-pypi-publish
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 96776c9..c437f6c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -85,7 +85,7 @@ jobs:
       - name: Check products
         run: pipx run twine check dist/*
 
-      - uses: pypa/gh-action-pypi-publish@v1.8.7
+      - uses: pypa/gh-action-pypi-publish@v1.8.10
         if: github.event_name == 'release' && github.event.action == 'published'
         with:
           # Remember to generate this and set it in "GitHub Secrets"

From 3bc31d30de6d5adad0780009c1e0bd4c5114626d Mon Sep 17 00:00:00 2001
From: rchan <rchan@turing.ac.uk>
Date: Tue, 15 Aug 2023 09:42:34 +0100
Subject: [PATCH 4/5] tests for additional features

---
 src/nlpsig/data_preparation.py |  27 +++--
 tests/conftest.py              |   6 ++
 tests/test_data_preparation.py |  87 ++++++++++++++--
 tests/test_padding.py          | 178 +++++++++++++++++++++++++++------
 4 files changed, 246 insertions(+), 52 deletions(-)

diff --git a/src/nlpsig/data_preparation.py b/src/nlpsig/data_preparation.py
index 0836be4..3aeaa2a 100644
--- a/src/nlpsig/data_preparation.py
+++ b/src/nlpsig/data_preparation.py
@@ -639,18 +639,18 @@ def _standardise_pd(
         vec: pd.Series, method: str | None
     ) -> dict[str, pd.Series | Callable]:
         # standardised pandas series
-        implemented = ["standardise", "normalise", "minmax", None]
+        implemented = ["z_score", "sum_divide", "minmax", None]
         if method not in implemented:
-            raise ValueError(f"`method` must be in {implemented}.")
+            raise ValueError(f"`method`: {method} must be in {implemented}.")
 
-        if method == "standardise":
+        if method == "z_score":
             mean = vec.mean()
             std = vec.std()
 
             def transform(x):
                 return (x - mean) / std
 
-        elif method == "normalise":
+        elif method == "sum_divide":
             sum = vec.sum()
 
             def transform(x):
@@ -722,11 +722,14 @@ def pad(
             The requested length of the path, default 5. This is ignored if `method="max"`.
         features : list[str] | str | None, optional
             Which feature(s) to keep. If None, then doesn't keep any.
-        standardise_method : str | None, optional
-            If not None, applies standardisation to the features, default None. Options:
+        standardise_method : list[str] | str | None, optional
+            If not None, applies standardisation to the features, default None.
+            If a list is passed, must be the same length as `features`. Options:
 
-            - "standardise": transforms by subtracting the mean and dividing by standard deviation
-            - "normalise": transforms by dividing by the sum
+            - "z_score": transforms by subtracting the mean and dividing by standard deviation
+            - "sum_divide": transforms by dividing by the sum
+            - "minmax": transforms by computing (x-min(x)) / (max(x)-min(x)) where x
+              is the vector to standardise
 
         embeddings : str, optional
             Which embeddings to keep, by default "full". Options:
@@ -847,7 +850,7 @@ def pad(
     def get_time_feature(
         self,
         time_feature: str = "timeline_index",
-        standardise_method: str = "standardise",
+        standardise_method: str = "z_score",
     ) -> dict[str, np.array | Callable | None]:
         """
         Returns a `np.array` object of the time_feature that is requested
@@ -859,8 +862,10 @@ def get_time_feature(
             Which time feature to obtain `np.array` for, by default "timeline_index".
         standardise_method : str | None, optional
             If not None, applies standardisation to the time features, default None. Options:
-            - "standardise": transforms by subtracting the mean and dividing by standard deviation
-            - "normalise": transforms by dividing by the sum
+            - "z_score": transforms by subtracting the mean and dividing by standard deviation
+            - "sum_divide": transforms by dividing by the sum
+            - "minmax": transforms by computing (x-min(x)) / (max(x)-min(x)) where x
+              is the vector to standardise
 
         Returns
         -------
diff --git a/tests/conftest.py b/tests/conftest.py
index b45f3e4..5d3b0cc 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -29,6 +29,8 @@ def test_df_with_datetime():
     return pd.DataFrame(
         {
             "text": [f"text_{i}" for i in range(n_entries)],
+            "binary_var": [rng.choice([0,1]) for i in range(n_entries)],
+            "continuous_var": rng.random(n_entries),
             "id_col": [0 for i in range(100)]
             + [rng.integers(1, 5) for i in range(n_entries - 100)],
             "label_col": [rng.integers(0, 4) for i in range(n_entries)],
@@ -43,6 +45,8 @@ def test_df_no_time():
     return pd.DataFrame(
         {
             "text": [f"text_{i}" for i in range(n_entries)],
+            "binary_var": [rng.choice([0,1]) for i in range(n_entries)],
+            "continuous_var": rng.random(n_entries),
             "id_col": [0 for i in range(100)]
             + [rng.integers(1, 5) for i in range(n_entries - 100)],
             "label_col": [rng.integers(0, 4) for i in range(n_entries)],
@@ -56,6 +60,8 @@ def test_df_to_pad():
     return pd.DataFrame(
         {
             "text": [f"text_{i}" for i in range(n_entries)],
+            "binary_var": [rng.choice([0,1]) for i in range(n_entries)],
+            "continuous_var": rng.random(n_entries),
             "id_col": 0,
             "label_col": [rng.integers(0, 4) for i in range(n_entries)],
         }
diff --git a/tests/test_data_preparation.py b/tests/test_data_preparation.py
index b16cdf7..3f43c68 100644
--- a/tests/test_data_preparation.py
+++ b/tests/test_data_preparation.py
@@ -530,6 +530,46 @@ def test_obtain_feature_columns_string(test_df_with_datetime, emb):
     assert obj._obtain_feature_columns("timeline_index") == ["timeline_index"]
 
 
+def test_obtain_feature_columns_string_additional_binary(test_df_with_datetime, emb):
+    # default initialisation
+    obj = PrepareData(original_df=test_df_with_datetime, embeddings=emb)
+    # originally only have the time features
+    assert set(obj._feature_list) == {
+        "time_encoding",
+        "time_diff",
+        "timeline_index",
+    }
+    # pass in string of column name that isn't in _feature_list but
+    # is a column in self.df
+    assert obj._obtain_feature_columns("binary_var") == ["binary_var"]
+    assert set(obj._feature_list) == {
+        "time_encoding",
+        "time_diff",
+        "timeline_index",
+        "binary_var",
+    }
+    
+
+def test_obtain_feature_columns_string_additional_continuous(test_df_with_datetime, emb):
+    # default initialisation
+    obj = PrepareData(original_df=test_df_with_datetime, embeddings=emb)
+    # originally only have the time features
+    assert set(obj._feature_list) == {
+        "time_encoding",
+        "time_diff",
+        "timeline_index",
+    }
+    # pass in string of column name that isn't in _feature_list but
+    # is a column in self.df
+    assert obj._obtain_feature_columns("continuous_var") == ["continuous_var"]
+    assert set(obj._feature_list) == {
+        "time_encoding",
+        "time_diff",
+        "timeline_index",
+        "continuous_var",
+    }
+
+
 def test_obtain_feature_columns_list(test_df_with_datetime, emb):
     # default initialisation
     obj = PrepareData(original_df=test_df_with_datetime, embeddings=emb)
@@ -544,6 +584,30 @@ def test_obtain_feature_columns_list(test_df_with_datetime, emb):
     ]
 
 
+def test_obtain_feature_columns_list_additional(test_df_with_datetime, emb):
+    # default initialisation
+    obj = PrepareData(original_df=test_df_with_datetime, embeddings=emb)
+    assert set(obj._feature_list) == {
+        "time_encoding",
+        "time_diff",
+        "timeline_index",
+    }
+    assert obj._obtain_feature_columns(["time_encoding", "timeline_index", "binary_var", "continuous_var"]) == [
+        "time_encoding",
+        "timeline_index",
+        "binary_var",
+        "continuous_var",
+    ]
+    # check that it has added binary_var and continuous_var to ._feature_list
+    assert set(obj._feature_list) == {
+        "time_encoding",
+        "time_diff",
+        "timeline_index",
+        "binary_var",
+        "continuous_var",
+    }
+
+
 def test_obtain_feature_columns_none(test_df_with_datetime, emb):
     # default initialisation
     obj = PrepareData(original_df=test_df_with_datetime, embeddings=emb)
@@ -608,10 +672,10 @@ def test_obtain_feature_columns_type(test_df_with_datetime, emb):
         obj._obtain_feature_columns(0)
 
 
-def test_standardise_pd_standardise(vec_to_standardise, test_df_no_time, emb):
-    # testing _standardise_pd with method=="standardise"
+def test_standardise_pd_z_score(vec_to_standardise, test_df_no_time, emb):
+    # testing _standardise_pd with method=="z_score"
     obj = PrepareData(original_df=test_df_no_time, embeddings=emb)
-    standardise = obj._standardise_pd(vec=vec_to_standardise, method="standardise")
+    standardise = obj._standardise_pd(vec=vec_to_standardise, method="z_score")
     assert type(standardise) == dict
     assert type(standardise["standardised_pd"]) == pd.Series
     pd.testing.assert_series_equal(
@@ -622,10 +686,10 @@ def test_standardise_pd_standardise(vec_to_standardise, test_df_no_time, emb):
     )
 
 
-def test_standardise_pd_normalise(vec_to_standardise, test_df_no_time, emb):
-    # testing _standardise_pd with method=="normalise"
+def test_standardise_pd_sum_divide(vec_to_standardise, test_df_no_time, emb):
+    # testing _standardise_pd with method=="sum_divide"
     obj = PrepareData(original_df=test_df_no_time, embeddings=emb)
-    standardise = obj._standardise_pd(vec=vec_to_standardise, method="normalise")
+    standardise = obj._standardise_pd(vec=vec_to_standardise, method="sum_divide")
     assert type(standardise) == dict
     assert type(standardise["standardised_pd"]) == pd.Series
     pd.testing.assert_series_equal(
@@ -637,7 +701,7 @@ def test_standardise_pd_normalise(vec_to_standardise, test_df_no_time, emb):
 
 
 def test_standardise_pd_minmax(vec_to_standardise, test_df_no_time, emb):
-    # testing _standardise_pd with method=="normalise"
+    # testing _standardise_pd with method=="minmax"
     obj = PrepareData(original_df=test_df_no_time, embeddings=emb)
     standardise = obj._standardise_pd(vec=vec_to_standardise, method="minmax")
     assert type(standardise) == dict
@@ -652,7 +716,7 @@ def test_standardise_pd_minmax(vec_to_standardise, test_df_no_time, emb):
 
 
 def test_standardise_pd_None(vec_to_standardise, test_df_no_time, emb):
-    # testing _standardise_pd with method=="normalise"
+    # testing _standardise_pd with method=None
     obj = PrepareData(original_df=test_df_no_time, embeddings=emb)
     standardise = obj._standardise_pd(vec=vec_to_standardise, method=None)
     assert type(standardise) == dict
@@ -666,9 +730,10 @@ def test_standardise_pd_None(vec_to_standardise, test_df_no_time, emb):
 
 def test_standardise_pd_wrong_method(vec_to_standardise, test_df_no_time, emb):
     # testing _standardise_pd with method that isn't implemented
-    implemented = ["standardise", "normalise", "minmax", None]
+    implemented = ["z_score", "sum_divide", "minmax", None]
     obj = PrepareData(original_df=test_df_no_time, embeddings=emb)
+    incorrect_method = "fake_method"
     with pytest.raises(
-        ValueError, match=re.escape(f"`method` must be in {implemented}.")
+        ValueError, match=re.escape(f"`method`: {incorrect_method} must be in {implemented}.")
     ):
-        obj._standardise_pd(vec=vec_to_standardise, method="fake_method")
+        obj._standardise_pd(vec=vec_to_standardise, method=incorrect_method)
diff --git a/tests/test_padding.py b/tests/test_padding.py
index 17040bc..deb7b0d 100644
--- a/tests/test_padding.py
+++ b/tests/test_padding.py
@@ -1193,9 +1193,9 @@ def test_pad_history_many_history_include_current(test_df_no_time, emb):
     )
 
 
-def test_pad_by_id_k_last(test_df_no_time, emb):
+def test_pad_by_id_k_last(test_df_with_datetime, emb):
     obj = PrepareData(
-        original_df=test_df_no_time,
+        original_df=test_df_with_datetime,
         embeddings=emb,
         id_column="id_col",
         label_column="label_col",
@@ -1213,8 +1213,8 @@ def test_pad_by_id_k_last(test_df_no_time, emb):
         pad_from_below=True,
     )
     # number of columns is:
-    # number of time features + number of columns in emb + id col + label col
-    ncol = len(obj._feature_list) + emb.shape[1] + 1 + 1
+    # timeline_index column + number of columns in emb + id col + label col
+    ncol = 1 + emb.shape[1] + 1 + 1
     assert type(obj.df_padded) == pd.DataFrame
     assert obj.df_padded.shape == (k * len(obj.original_df["id_col"].unique()), ncol)
     assert type(obj.array_padded) == np.ndarray
@@ -1222,9 +1222,39 @@ def test_pad_by_id_k_last(test_df_no_time, emb):
     assert obj.array_padded.shape == (len(obj.original_df["id_col"].unique()), k, ncol)
 
 
-def test_pad_by_id_max(test_df_no_time, emb):
+def test_pad_by_id_k_last_additional(test_df_with_datetime, emb):
     obj = PrepareData(
-        original_df=test_df_no_time,
+        original_df=test_df_with_datetime,
+        embeddings=emb,
+        id_column="id_col",
+        label_column="label_col",
+    )
+    k = 10
+    features = ["timeline_index", "binary_var", "continuous_var"]
+    padded_array = obj.pad(
+        pad_by="id",
+        method="k_last",
+        zero_padding=True,
+        k=k,
+        features=features,
+        standardise_method=None,
+        embeddings="full",
+        include_current_embedding=True,
+        pad_from_below=True,
+    )
+    # number of columns is:
+    # number of features requested + number of columns in emb + id col + label col
+    ncol = len(features) + emb.shape[1] + 1 + 1
+    assert type(obj.df_padded) == pd.DataFrame
+    assert obj.df_padded.shape == (k * len(obj.original_df["id_col"].unique()), ncol)
+    assert type(obj.array_padded) == np.ndarray
+    assert np.array_equal(padded_array, obj.array_padded)
+    assert obj.array_padded.shape == (len(obj.original_df["id_col"].unique()), k, ncol)
+    
+
+def test_pad_by_id_max(test_df_with_datetime, emb):
+    obj = PrepareData(
+        original_df=test_df_with_datetime,
         embeddings=emb,
         id_column="id_col",
         label_column="label_col",
@@ -1240,8 +1270,37 @@ def test_pad_by_id_max(test_df_no_time, emb):
         pad_from_below=True,
     )
     # number of columns is:
-    # number of time features + number of columns in emb + id col + label col
-    ncol = len(obj._feature_list) + emb.shape[1] + 1 + 1
+    # timeline_index column + number of columns in emb + id col + label col
+    ncol = 1 + emb.shape[1] + 1 + 1
+    assert type(obj.df_padded) == pd.DataFrame
+    k = obj.original_df["id_col"].value_counts().max()
+    assert obj.df_padded.shape == (k * len(obj.original_df["id_col"].unique()), ncol)
+    assert type(obj.array_padded) == np.ndarray
+    assert np.array_equal(padded_array, obj.array_padded)
+    assert obj.array_padded.shape == (len(obj.original_df["id_col"].unique()), k, ncol)
+    
+
+def test_pad_by_id_max_additional(test_df_with_datetime, emb):
+    obj = PrepareData(
+        original_df=test_df_with_datetime,
+        embeddings=emb,
+        id_column="id_col",
+        label_column="label_col",
+    )
+    features = ["timeline_index", "binary_var", "continuous_var"]
+    padded_array = obj.pad(
+        pad_by="id",
+        method="max",
+        zero_padding=True,
+        features=features,
+        standardise_method=None,
+        embeddings="full",
+        include_current_embedding=True,
+        pad_from_below=True,
+    )
+    # number of columns is:
+    # number of features requested + number of columns in emb + id col + label col
+    ncol = len(features) + emb.shape[1] + 1 + 1
     assert type(obj.df_padded) == pd.DataFrame
     k = obj.original_df["id_col"].value_counts().max()
     assert obj.df_padded.shape == (k * len(obj.original_df["id_col"].unique()), ncol)
@@ -1250,9 +1309,9 @@ def test_pad_by_id_max(test_df_no_time, emb):
     assert obj.array_padded.shape == (len(obj.original_df["id_col"].unique()), k, ncol)
 
 
-def test_pad_by_history_k_last(test_df_no_time, emb):
+def test_pad_by_history_k_last(test_df_with_datetime, emb):
     obj = PrepareData(
-        original_df=test_df_no_time,
+        original_df=test_df_with_datetime,
         embeddings=emb,
         id_column="id_col",
         label_column="label_col",
@@ -1270,8 +1329,8 @@ def test_pad_by_history_k_last(test_df_no_time, emb):
         pad_from_below=True,
     )
     # number of columns is:
-    # number of time features + number of columns in emb + id col + label col
-    ncol = len(obj._feature_list) + emb.shape[1] + 1 + 1
+    # timeline_index column + number of columns in emb + id col + label col
+    ncol = 1 + emb.shape[1] + 1 + 1
     assert type(obj.df_padded) == pd.DataFrame
     assert obj.df_padded.shape == (k * len(obj.original_df.index), ncol)
     assert type(obj.array_padded) == np.ndarray
@@ -1279,9 +1338,39 @@ def test_pad_by_history_k_last(test_df_no_time, emb):
     assert obj.array_padded.shape == (len(obj.original_df.index), k, ncol)
 
 
-def test_pad_by_history_max(test_df_no_time, emb):
+def test_pad_by_history_k_last_additional(test_df_with_datetime, emb):
     obj = PrepareData(
-        original_df=test_df_no_time,
+        original_df=test_df_with_datetime,
+        embeddings=emb,
+        id_column="id_col",
+        label_column="label_col",
+    )
+    k = 10
+    features = ["timeline_index", "binary_var", "continuous_var"]
+    padded_array = obj.pad(
+        pad_by="history",
+        method="k_last",
+        zero_padding=True,
+        k=k,
+        features=features,
+        standardise_method=None,
+        embeddings="full",
+        include_current_embedding=True,
+        pad_from_below=True,
+    )
+    # number of columns is:
+    # number of features requested + number of columns in emb + id col + label col
+    ncol = len(features) + emb.shape[1] + 1 + 1
+    assert type(obj.df_padded) == pd.DataFrame
+    assert obj.df_padded.shape == (k * len(obj.original_df.index), ncol)
+    assert type(obj.array_padded) == np.ndarray
+    assert np.array_equal(padded_array, obj.array_padded)
+    assert obj.array_padded.shape == (len(obj.original_df.index), k, ncol)
+
+
+def test_pad_by_history_max(test_df_with_datetime, emb):
+    obj = PrepareData(
+        original_df=test_df_with_datetime,
         embeddings=emb,
         id_column="id_col",
         label_column="label_col",
@@ -1297,8 +1386,37 @@ def test_pad_by_history_max(test_df_no_time, emb):
         pad_from_below=True,
     )
     # number of columns is:
-    # number of time features + number of columns in emb + id col + label col
-    ncol = len(obj._feature_list) + emb.shape[1] + 1 + 1
+    # timeline_index column + number of columns in emb + id col + label col
+    ncol = 1 + emb.shape[1] + 1 + 1
+    assert type(obj.df_padded) == pd.DataFrame
+    k = obj.original_df["id_col"].value_counts().max()
+    assert obj.df_padded.shape == (k * len(obj.original_df.index), ncol)
+    assert type(obj.array_padded) == np.ndarray
+    assert np.array_equal(padded_array, obj.array_padded)
+    assert obj.array_padded.shape == (len(obj.original_df.index), k, ncol)
+
+
+def test_pad_by_history_max_additional(test_df_with_datetime, emb):
+    obj = PrepareData(
+        original_df=test_df_with_datetime,
+        embeddings=emb,
+        id_column="id_col",
+        label_column="label_col",
+    )
+    features = ["timeline_index", "binary_var", "continuous_var"]
+    padded_array = obj.pad(
+        pad_by="history",
+        method="max",
+        zero_padding=True,
+        features=features,
+        standardise_method=None,
+        embeddings="full",
+        include_current_embedding=True,
+        pad_from_below=True,
+    )
+    # number of columns is:
+    # number of features requested + number of columns in emb + id col + label col
+    ncol = len(features) + emb.shape[1] + 1 + 1
     assert type(obj.df_padded) == pd.DataFrame
     k = obj.original_df["id_col"].value_counts().max()
     assert obj.df_padded.shape == (k * len(obj.original_df.index), ncol)
@@ -1361,17 +1479,17 @@ def test_pad_by_id_k_last_standardise_standardise(test_df_no_time, emb):
         zero_padding=True,
         k=k,
         features="timeline_index",
-        standardise_method="standardise",
+        standardise_method="z_score",
         embeddings="full",
         include_current_embedding=True,
         pad_from_below=True,
     )
     standardise_vec = obj._standardise_pd(
-        vec=obj.df["timeline_index"], method="standardise"
+        vec=obj.df["timeline_index"], method="z_score"
     )["standardised_pd"]
     pd.testing.assert_series_equal(obj.df["timeline_index"], standardise_vec)
     # number of columns is:
-    # number of time features + number of columns in emb + id col + label col
+    # number of features + number of columns in emb + id col + label col
     ncol = len(obj._feature_list) + emb.shape[1] + 1 + 1
     assert type(obj.df_padded) == pd.DataFrame
     assert obj.df_padded.shape == (k * len(obj.original_df["id_col"].unique()), ncol)
@@ -1394,17 +1512,17 @@ def test_pad_by_id_k_last_standardise_normalise(test_df_no_time, emb):
         zero_padding=True,
         k=k,
         features="timeline_index",
-        standardise_method="normalise",
+        standardise_method="sum_divide",
         embeddings="full",
         include_current_embedding=True,
         pad_from_below=True,
     )
     normalise_vec = obj._standardise_pd(
-        vec=obj.df["timeline_index"], method="normalise"
+        vec=obj.df["timeline_index"], method="sum_divide"
     )["standardised_pd"]
     pd.testing.assert_series_equal(obj.df["timeline_index"], normalise_vec)
     # number of columns is:
-    # number of time features + number of columns in emb + id col + label col
+    # number of features + number of columns in emb + id col + label col
     ncol = len(obj._feature_list) + emb.shape[1] + 1 + 1
     assert type(obj.df_padded) == pd.DataFrame
     assert obj.df_padded.shape == (k * len(obj.original_df["id_col"].unique()), ncol)
@@ -1437,7 +1555,7 @@ def test_pad_by_id_k_last_standardise_minmax(test_df_no_time, emb):
     ]
     pd.testing.assert_series_equal(obj.df["timeline_index"], minmax_vec)
     # number of columns is:
-    # number of time features + number of columns in emb + id col + label col
+    # number of features + number of columns in emb + id col + label col
     ncol = len(obj._feature_list) + emb.shape[1] + 1 + 1
     assert type(obj.df_padded) == pd.DataFrame
     assert obj.df_padded.shape == (k * len(obj.original_df["id_col"].unique()), ncol)
@@ -1454,13 +1572,13 @@ def test_pad_by_id_k_last_standardise_multiple(test_df_with_datetime, emb):
         label_column="label_col",
     )
     k = 10
-    featuress = ["timeline_index", "time_encoding", "time_diff"]
+    features = ["timeline_index", "time_encoding", "time_diff"]
     # expected standardised vectors
     standardised_vec = obj._standardise_pd(
-        vec=obj.df["timeline_index"], method="standardise"
+        vec=obj.df["timeline_index"], method="z_score"
     )["standardised_pd"]
     normalised_vec = obj._standardise_pd(
-        vec=obj.df["time_encoding"], method="normalise"
+        vec=obj.df["time_encoding"], method="sum_divide"
     )["standardised_pd"]
     none_standardisation_vec = obj.df["time_diff"]
     # pad and perform standardisation
@@ -1469,8 +1587,8 @@ def test_pad_by_id_k_last_standardise_multiple(test_df_with_datetime, emb):
         method="k_last",
         zero_padding=True,
         k=k,
-        features=featuress,
-        standardise_method=["standardise", "normalise", None],
+        features=features,
+        standardise_method=["z_score", "sum_divide", None],
         embeddings="full",
         include_current_embedding=True,
         pad_from_below=True,
@@ -1479,8 +1597,8 @@ def test_pad_by_id_k_last_standardise_multiple(test_df_with_datetime, emb):
     pd.testing.assert_series_equal(obj.df["time_encoding"], normalised_vec)
     pd.testing.assert_series_equal(obj.df["time_diff"], none_standardisation_vec)
     # number of columns is:
-    # number of time features + number of columns in emb + id col + label col
-    ncol = len(featuress) + emb.shape[1] + 1 + 1
+    # number of features + number of columns in emb + id col + label col
+    ncol = len(features) + emb.shape[1] + 1 + 1
     assert type(obj.df_padded) == pd.DataFrame
     assert obj.df_padded.shape == (k * len(obj.original_df["id_col"].unique()), ncol)
     assert type(obj.array_padded) == np.ndarray

From bd7fb1d47a95cc16017a382005a45004a774de67 Mon Sep 17 00:00:00 2001
From: rchan <rchan@turing.ac.uk>
Date: Tue, 15 Aug 2023 10:52:59 +0100
Subject: [PATCH 5/5] apply lint

---
 src/nlpsig/data_preparation.py | 16 ++++++----------
 tests/conftest.py              |  6 +++---
 tests/test_data_preparation.py | 34 ++++++++++++++++------------------
 tests/test_padding.py          |  4 ++--
 4 files changed, 27 insertions(+), 33 deletions(-)

diff --git a/src/nlpsig/data_preparation.py b/src/nlpsig/data_preparation.py
index 3aeaa2a..9024fd9 100644
--- a/src/nlpsig/data_preparation.py
+++ b/src/nlpsig/data_preparation.py
@@ -328,9 +328,9 @@ def _check_feature_exists(self, feature: str) -> bool:
             # not in ._feature_list, but is a valid column name in self.df,
             # so add to feature list
             self._feature_list += [feature]
-            
+
         return feature in self._feature_list
-    
+
     def _obtain_feature_columns(
         self,
         features: list[str] | str | None,
@@ -369,8 +369,8 @@ def _obtain_feature_columns(
             # convert to list of strings
             if isinstance(features, str):
                 features = [features]
-            
-            if isinstance(features, list):    
+
+            if isinstance(features, list):
                 # check each item in features is in self._feature_list
                 # if it isn't, but is a column in self.df, it will add
                 # it to self._feature_list
@@ -777,9 +777,7 @@ def pad(
             raise ValueError("`method` must be either 'k_last' or 'max'.")
 
         # obtain feature colnames
-        feature_colnames = self._obtain_feature_columns(
-            features=features
-        )
+        feature_colnames = self._obtain_feature_columns(features=features)
         if len(feature_colnames) > 0:
             if isinstance(standardise_method, str):
                 standardise_method = [standardise_method] * len(feature_colnames)
@@ -881,9 +879,7 @@ def get_time_feature(
             (can be found in `._feature_list` attribute).
         """
         if time_feature not in self._feature_list:
-            raise ValueError(
-                f"`time_feature` should be in {self._feature_list}."
-            )
+            raise ValueError(f"`time_feature` should be in {self._feature_list}.")
 
         if not self.time_features_added:
             self.set_time_features()
diff --git a/tests/conftest.py b/tests/conftest.py
index 5d3b0cc..cda224c 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -29,7 +29,7 @@ def test_df_with_datetime():
     return pd.DataFrame(
         {
             "text": [f"text_{i}" for i in range(n_entries)],
-            "binary_var": [rng.choice([0,1]) for i in range(n_entries)],
+            "binary_var": [rng.choice([0, 1]) for i in range(n_entries)],
             "continuous_var": rng.random(n_entries),
             "id_col": [0 for i in range(100)]
             + [rng.integers(1, 5) for i in range(n_entries - 100)],
@@ -45,7 +45,7 @@ def test_df_no_time():
     return pd.DataFrame(
         {
             "text": [f"text_{i}" for i in range(n_entries)],
-            "binary_var": [rng.choice([0,1]) for i in range(n_entries)],
+            "binary_var": [rng.choice([0, 1]) for i in range(n_entries)],
             "continuous_var": rng.random(n_entries),
             "id_col": [0 for i in range(100)]
             + [rng.integers(1, 5) for i in range(n_entries - 100)],
@@ -60,7 +60,7 @@ def test_df_to_pad():
     return pd.DataFrame(
         {
             "text": [f"text_{i}" for i in range(n_entries)],
-            "binary_var": [rng.choice([0,1]) for i in range(n_entries)],
+            "binary_var": [rng.choice([0, 1]) for i in range(n_entries)],
             "continuous_var": rng.random(n_entries),
             "id_col": 0,
             "label_col": [rng.integers(0, 4) for i in range(n_entries)],
diff --git a/tests/test_data_preparation.py b/tests/test_data_preparation.py
index 3f43c68..35d7746 100644
--- a/tests/test_data_preparation.py
+++ b/tests/test_data_preparation.py
@@ -28,11 +28,7 @@ def test_default_initialisation_datetime(
     # 1 dummy id column
     assert obj.df.shape == (
         len(obj.original_df.index),
-        1
-        + len(obj.original_df.columns)
-        + emb.shape[1]
-        + len(obj._feature_list)
-        + 1,
+        1 + len(obj.original_df.columns) + emb.shape[1] + len(obj._feature_list) + 1,
     )
     assert obj.pooled_embeddings is None
     assert set(obj._feature_list) == {
@@ -67,10 +63,7 @@ def test_default_initialisation_no_time(
     # 1 dummy id column
     assert obj.df.shape == (
         len(obj.original_df.index),
-        len(obj.original_df.columns)
-        + emb.shape[1]
-        + len(obj._feature_list)
-        + 1,
+        len(obj.original_df.columns) + emb.shape[1] + len(obj._feature_list) + 1,
     )
     assert obj.pooled_embeddings is None
     assert obj._feature_list == ["timeline_index"]
@@ -105,10 +98,7 @@ def test_initialisation_with_id_and_label_datetime(
     # 3 time features
     assert obj.df.shape == (
         len(obj.original_df.index),
-        1
-        + len(obj.original_df.columns)
-        + emb.shape[1]
-        + len(obj._feature_list),
+        1 + len(obj.original_df.columns) + emb.shape[1] + len(obj._feature_list),
     )
     assert obj.pooled_embeddings is None
     assert set(obj._feature_list) == {
@@ -516,7 +506,10 @@ def test_obtain_colnames_both(test_df_with_datetime, emb, emb_reduced):
     )
     assert obj._obtain_embedding_colnames(embeddings="full") == emb_names
     assert obj._obtain_embedding_colnames(embeddings="dim_reduced") == emb_reduced_names
-    assert obj._obtain_embedding_colnames(embeddings="both") == emb_reduced_names + emb_names
+    assert (
+        obj._obtain_embedding_colnames(embeddings="both")
+        == emb_reduced_names + emb_names
+    )
 
 
 def test_obtain_feature_columns_string(test_df_with_datetime, emb):
@@ -548,9 +541,11 @@ def test_obtain_feature_columns_string_additional_binary(test_df_with_datetime,
         "timeline_index",
         "binary_var",
     }
-    
 
-def test_obtain_feature_columns_string_additional_continuous(test_df_with_datetime, emb):
+
+def test_obtain_feature_columns_string_additional_continuous(
+    test_df_with_datetime, emb
+):
     # default initialisation
     obj = PrepareData(original_df=test_df_with_datetime, embeddings=emb)
     # originally only have the time features
@@ -592,7 +587,9 @@ def test_obtain_feature_columns_list_additional(test_df_with_datetime, emb):
         "time_diff",
         "timeline_index",
     }
-    assert obj._obtain_feature_columns(["time_encoding", "timeline_index", "binary_var", "continuous_var"]) == [
+    assert obj._obtain_feature_columns(
+        ["time_encoding", "timeline_index", "binary_var", "continuous_var"]
+    ) == [
         "time_encoding",
         "timeline_index",
         "binary_var",
@@ -734,6 +731,7 @@ def test_standardise_pd_wrong_method(vec_to_standardise, test_df_no_time, emb):
     obj = PrepareData(original_df=test_df_no_time, embeddings=emb)
     incorrect_method = "fake_method"
     with pytest.raises(
-        ValueError, match=re.escape(f"`method`: {incorrect_method} must be in {implemented}.")
+        ValueError,
+        match=re.escape(f"`method`: {incorrect_method} must be in {implemented}."),
     ):
         obj._standardise_pd(vec=vec_to_standardise, method=incorrect_method)
diff --git a/tests/test_padding.py b/tests/test_padding.py
index deb7b0d..e2612a3 100644
--- a/tests/test_padding.py
+++ b/tests/test_padding.py
@@ -1250,7 +1250,7 @@ def test_pad_by_id_k_last_additional(test_df_with_datetime, emb):
     assert type(obj.array_padded) == np.ndarray
     assert np.array_equal(padded_array, obj.array_padded)
     assert obj.array_padded.shape == (len(obj.original_df["id_col"].unique()), k, ncol)
-    
+
 
 def test_pad_by_id_max(test_df_with_datetime, emb):
     obj = PrepareData(
@@ -1278,7 +1278,7 @@ def test_pad_by_id_max(test_df_with_datetime, emb):
     assert type(obj.array_padded) == np.ndarray
     assert np.array_equal(padded_array, obj.array_padded)
     assert obj.array_padded.shape == (len(obj.original_df["id_col"].unique()), k, ncol)
-    
+
 
 def test_pad_by_id_max_additional(test_df_with_datetime, emb):
     obj = PrepareData(