Merge pull request #44 from datasig-ac-uk/verbose-option
Fix #40: Add verbose option
rchan26 authored Aug 15, 2023
2 parents ef6e51d + cd179e6 commit 4b163e2
Showing 2 changed files with 175 additions and 96 deletions.
124 changes: 78 additions & 46 deletions src/nlpsig/data_preparation.py
@@ -49,7 +49,10 @@ def __init__(
         pooled_embeddings: np.array | None = None,
         id_column: str | None = None,
         label_column: str | None = None,
+        verbose: bool = True,
     ):
+        self.verbose = verbose
+
         # perform checks that original_df has the right column names to work with
         if embeddings.ndim != 2:
             raise ValueError("`embeddings` should be a 2-dimensional array.")
@@ -79,17 +82,19 @@ def __init__(
         # obtain modelling dataframe
         self.df: pd.DataFrame | None = None
         self.df = self._get_modeling_dataframe()
+
         # set pooled embeddings if provided
         if pooled_embeddings is not None:
             if pooled_embeddings.ndim != 2:
                 raise ValueError(
                     "If provided, `pooled_embeddings` should be a 2-dimensional array."
                 )
             if len(self.df[self.id_column].unique()) != pooled_embeddings.shape[0]:
-                print(
-                    f"[INFO] `len(self.df[self.id_column].unique())`={len(self.df[self.id_column].unique())}"
-                    f" and `pooled_embeddings.shape[0]`={pooled_embeddings.shape[0]}."
-                )
+                if self.verbose:
+                    print(
+                        f"[INFO] `len(self.df[self.id_column].unique())`={len(self.df[self.id_column].unique())}"
+                        f" and `pooled_embeddings.shape[0]`={pooled_embeddings.shape[0]}."
+                    )
                 raise ValueError(
                     "If provided, `pooled_embeddings` should have the same number "
                     "of rows as there are different ids in the id-column."
@@ -123,17 +128,21 @@ def _get_modeling_dataframe(self) -> pd.DataFrame:
         if self.df is not None:
             return self.df
 
-        print("[INFO] Concatenating the embeddings to the dataframe...")
-        print("[INFO] - columns beginning with 'e' denote the full embeddings.")
+        if self.verbose:
+            print("[INFO] Concatenating the embeddings to the dataframe...")
+            print("[INFO] - columns beginning with 'e' denote the full embeddings.")
+
         embedding_df = pd.DataFrame(
             self.embeddings,
             columns=[f"e{i+1}" for i in range(self.embeddings.shape[1])],
         )
 
         if self.embeddings_reduced is not None:
-            print(
-                "[INFO] - columns beginning with 'd' denote the dimension reduced embeddings."
-            )
+            if self.verbose:
+                print(
+                    "[INFO] - columns beginning with 'd' denote the dimension reduced embeddings."
+                )
+
             embeddings_reduced_df = pd.DataFrame(
                 self.embeddings_reduced,
                 columns=[f"d{i+1}" for i in range(self.embeddings_reduced.shape[1])],
@@ -151,17 +160,21 @@ def _get_modeling_dataframe(self) -> pd.DataFrame:
             [self.original_df.reset_index(drop=True), embedding_df],
             axis=1,
         )
+
         if self.id_column is None:
             self.id_column = "dummy_id"
-            print(
-                f"[INFO] No id_column was passed, so setting id_column to '{self.id_column}'."
-            )
+            if self.verbose:
+                print(
+                    f"[INFO] No id_column was passed, so setting id_column to '{self.id_column}'."
+                )
+
         if self.id_column not in self.original_df.columns:
+            if self.verbose:
+                print(
+                    f"[INFO] There is no column in `.original_df` called '{self.id_column}'. "
+                    f"Adding a new column named '{self.id_column}' of zeros."
+                )
             # set default value to id_column
-            print(
-                f"[INFO] There is no column in `.original_df` called '{self.id_column}'. "
-                f"Adding a new column named '{self.id_column}' of zeros."
-            )
             df[self.id_column] = 0
 
         return df
@@ -203,17 +216,23 @@ def _set_time_features(self) -> pd.DataFrame:
             Updated dataframe with time features.
         """
         if self.time_features_added:
-            print("Time features have already been added.")
+            if self.verbose:
+                print("Time features have already been added.")
             return None
-        print("[INFO] Adding time feature columns into dataframe in `.df`.")
+
+        if self.verbose:
+            print("[INFO] Adding time feature columns into dataframe in `.df`.")
+
         if "datetime" in self.df.columns:
             self._feature_list += ["time_encoding", "time_diff"]
 
             # checking 'datetime' column is datetime type
             self.df["datetime"] = pd.to_datetime(self.df["datetime"])
 
             # obtain time encoding by computing the fraction of year it is in
-            print("[INFO] Adding 'time_encoding' feature...")
+            if self.verbose:
+                print("[INFO] Adding 'time_encoding' feature...")
+
             self.df["time_encoding"] = self.df["datetime"].map(
                 lambda t: self._time_fraction(t)
             )
@@ -224,7 +243,9 @@ def _set_time_features(self) -> pd.DataFrame:
             self.df = self.df.sort_values(by=[self.id_column, "datetime"])
 
             # calculate time difference between posts
-            print("[INFO] Adding 'time_diff' feature...")
+            if self.verbose:
+                print("[INFO] Adding 'time_diff' feature...")
+
             self.df["time_diff"] = list(
                 self.df.groupby(self.id_column)
                 .apply(
@@ -240,18 +261,22 @@ def _set_time_features(self) -> pd.DataFrame:
                 .explode()
             )
         else:
-            print(
-                "[INFO] Note 'datetime' is not a column in `.df`, "
-                "so only 'timeline_index' is added."
-            )
-            print(
-                "[INFO] As 'datetime' is not a column in `.df`, "
-                "we assume that the data is ordered by time with respect to the id."
-            )
+            if self.verbose:
+                print(
+                    "[INFO] Note 'datetime' is not a column in `.df`, "
+                    "so only 'timeline_index' is added."
+                )
+                print(
+                    "[INFO] As 'datetime' is not a column in `.df`, "
+                    "we assume that the data is ordered by time with respect to the id."
+                )
+
         # assign index for each post in each timeline
         self._feature_list += ["timeline_index"]
 
-        print("[INFO] Adding 'timeline_index' feature...")
+        if self.verbose:
+            print("[INFO] Adding 'timeline_index' feature...")
+
         self.df["timeline_index"] = list(
             self.df.groupby(self.id_column)
             .apply(lambda x: list(range(1, len(x) + 1)))
@@ -756,9 +781,11 @@ def pad(
             dimension reduced embeddings, time features)
         """
-        print(
-            "[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes."
-        )
+        if self.verbose:
+            print(
+                "[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes."
+            )
+
         if pad_by not in ["id", "history"]:
             raise ValueError("`pad_by` must be either 'id' or 'history'.")
 
@@ -1017,11 +1044,13 @@ def get_torch_path_for_SWNUNetwork(
         if include_embedding_in_input:
             # repeat the embeddings which will be concatenated to the path later
             if self.pad_method == "id":
-                print(
-                    f"[INFO] The path was created for each {self.id_column} in the dataframe, "
-                    "so to include embeddings in the FFN input, we concatenate the "
-                    "pooled embeddings."
-                )
+                if self.verbose:
+                    print(
+                        f"[INFO] The path was created for each {self.id_column} in the dataframe, "
+                        "so to include embeddings in the FFN input, we concatenate the "
+                        "pooled embeddings."
+                    )
+
                 if self.pooled_embeddings is None:
                     raise ValueError(
                         "There were no pooled embeddings passed into the class."
@@ -1035,11 +1064,13 @@ def get_torch_path_for_SWNUNetwork(
                 )
             emb = torch.from_numpy(self.pooled_embeddings.astype("float")).float()
         elif self.pad_method == "history":
-            print(
-                "[INFO] The path was created for each item in the dataframe, "
-                "by looking at its history, so to include embeddings in the FFN input, "
-                "we concatenate the embeddings for each sentence / text."
-            )
+            if self.verbose:
+                print(
+                    "[INFO] The path was created for each item in the dataframe, "
+                    "by looking at its history, so to include embeddings in the FFN input, "
+                    "we concatenate the embeddings for each sentence / text."
+                )
+
             if reduced_embeddings:
                 if self.embeddings_reduced is None:
                     raise ValueError(
@@ -1148,10 +1179,11 @@ def check_history_length_for_SeqSigNet(
         required_history_length = shift * n + (window_size - shift)
         if self.array_padded.shape[1] != required_history_length:
             # required history length not met
-            print(
-                f"A history length of size {required_history_length} is required, "
-                f"but we have history length size of {self.array_padded.shape[1]}"
-            )
+            if self.verbose:
+                print(
+                    f"A history length of size {required_history_length} is required, "
+                    f"but we have a history length of size {self.array_padded.shape[1]}"
+                )
             return False
 
         # we have the required history length
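The pattern throughout is the same: every bare print(...) is gated behind the new self.verbose flag, which defaults to True so existing output is unchanged. A minimal usage sketch follows; it assumes the class shown in the diff is nlpsig's PrepareData and that it is importable from the package root, and the toy dataframe and embeddings are illustrative only:

# Minimal sketch (assumption: the class above is nlpsig's PrepareData,
# importable from the package root; toy data is illustrative only).
import numpy as np
import pandas as pd

from nlpsig import PrepareData

# toy data: three texts across two ids, one 8-dim embedding per row
df = pd.DataFrame({"text": ["first", "second", "third"], "id": [0, 0, 1]})
embeddings = np.random.rand(3, 8)

# verbose=True (the default) keeps the [INFO] messages;
# verbose=False silences them.
prep = PrepareData(
    original_df=df,
    embeddings=embeddings,
    id_column="id",
    verbose=False,
)

Defaulting verbose to True keeps the change backwards compatible: existing callers see the same [INFO] messages as before, and only callers that opt out with verbose=False get silent runs.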