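"""Entry point for the spam and phishing email detection pipeline.

The script preprocesses the SpamAssassin and CEAS_08 datasets, engineers
and cleans features, merges the datasets, injects noise, performs a
stratified k-fold split, extracts BERT features with SMOTE balancing,
trains and evaluates the ensemble model for each fold, and plots the
learning curves.
"""
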
# Standard library imports
import os  # Interact with the operating system
import logging  # Logging library
import warnings  # Warning control
import email  # Email handling
import email.policy  # Email policies

# Third-party libraries
import numpy as np  # Numerical operations
import pandas as pd  # Data manipulation and analysis
import nltk  # Natural Language Toolkit
import spacy  # NLP library
import tensorflow as tf  # TensorFlow library
from transformers.utils import logging as transformers_logging  # Transformers logging control
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE  # Handling imbalanced data
from bs4 import MarkupResemblesLocatorWarning  # BeautifulSoup parser warning class
from datasets import load_dataset  # Load datasets
from src.spamandphishingdetection import (
initialize_environment,
load_config,
get_file_paths,
get_model_path,
get_params_path,
DatasetProcessor,
log_label_percentages,
check_missing_values,
feature_engineering,
load_or_save_emails,
merge_dataframes,
verify_merged_dataframe,
combine_dataframes,
verify_combined_dataframe,
save_combined_dataframe,
load_or_clean_data,
data_cleaning,
save_dataframe_to_csv,
combine_columns_for_cleaning,
generate_noisy_dataframe,
stratified_k_fold_split,
BERTFeatureExtractor,
BERTFeatureTransformer,
RareCategoryRemover,
run_pipeline_or_load,
main_model_training,
plot_learning_curve
)
# Main processing function
def main():
nlp, loss_fn = initialize_environment(__file__)
config = load_config("config/config.json")
file_paths = get_file_paths(config)
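    # Keys that file_paths is assumed to provide (all are referenced later in
    # this script): 'ceas_08_dataset', 'preprocessed_spam_assassin_file',
    # 'preprocessed_ceas_file', 'cleaned_ceas_headers',
    # 'merged_cleaned_ceas_headers', 'merged_spam_assassin_file',
    # 'merged_ceas_file', 'merged_data_frame' and 'merged_cleaned_data_frame'.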
# Load the datasets
df_ceas = pd.read_csv(
file_paths['ceas_08_dataset'], sep=',', encoding='utf-8')
dataset = load_dataset('talby/spamassassin',
split='train', trust_remote_code=True)
df_spamassassin = dataset.to_pandas()
try:
# ****************************** #
# Data Preprocessing #
# ****************************** #
logging.info(f"Beginning Data Preprocessing...")
        # Invert the SpamAssassin labels (1 -> 0, 0 -> 1) so they match the CEAS_08 labeling scheme
df_spamassassin['label'] = df_spamassassin['label'].map({1: 0, 0: 1})
processor_spamassassin = DatasetProcessor(
df_spamassassin, 'text', 'spam_assassin', file_paths['preprocessed_spam_assassin_file'])
df_processed_spamassassin = processor_spamassassin.process_dataset()
processor_ceas = DatasetProcessor(
df_ceas, 'body', 'ceas_08', file_paths['preprocessed_ceas_file'])
df_processed_ceas = processor_ceas.process_dataset()
# Combined DataFrame
combined_percentage_df = pd.concat(
[df_processed_spamassassin, df_processed_ceas])
log_label_percentages(df_processed_ceas, 'CEAS_08')
log_label_percentages(df_processed_spamassassin, 'SpamAssassin')
log_label_percentages(
combined_percentage_df, 'Combined CEAS_08 and SpamAssassin (No Processing)')
check_missing_values(combined_percentage_df,
'Combined CEAS_08 and SpamAssassin (No Processing)')
logging.info(f"Data Preprocessing completed.\n")
# Columns in CEAS_08 dataset: ['sender', 'receiver', 'date', 'subject', 'body', 'label', 'urls']
# Columns in SpamAssassin dataset: ['label', 'group', 'text']
# ****************************** #
# Feature Engineering #
# ****************************** #
logging.info(f"Beginning Feature Engineering...")
spamassassin_headers_df, ceas_headers_df = feature_engineering(
df_processed_spamassassin, df_processed_ceas, file_paths)
# ************************* #
# Data Cleaning #
# ************************* #
logging.info(
f"Beginning Data Cleaning of CEAS_08 ['sender', 'receiver']...")
df_cleaned_ceas_headers = load_or_save_emails(
df_processed_ceas, file_paths['cleaned_ceas_headers'])
logging.info(
f"Beginning merging of Cleaned Headers of CEAS_08 with Processed CEAS_08...")
if len(df_cleaned_ceas_headers) != len(df_processed_ceas):
            logging.error(
                "The number of rows in the Cleaned Headers of CEAS_08 DataFrame does not match Processed CEAS_08.")
            raise ValueError(
                "The number of rows in the Cleaned Headers of CEAS_08 DataFrame does not match Processed CEAS_08.")
else:
df_processed_ceas.drop(
columns=['sender', 'receiver'], inplace=True)
            # Log the columns of both DataFrames before concatenation
logging.info(
f"Columns in df_cleaned_ceas_headers: {df_cleaned_ceas_headers.columns.tolist()}")
logging.info(
f"Columns in df_processed_ceas: {df_processed_ceas.columns.tolist()}")
df_cleaned_ceas_headers_merge = pd.concat(
[df_cleaned_ceas_headers.reset_index(drop=True),
df_processed_ceas.reset_index(drop=True)], axis=1)
missing_in_cleaned_ceas_header_merged = df_cleaned_ceas_headers_merge[
(df_cleaned_ceas_headers_merge['sender'].isnull()) |
(df_cleaned_ceas_headers_merge['receiver'].isnull())]
logging.info(
f"Number of missing rows in Merged Cleaned Headers of CEAS_08 DataFrame: {len(missing_in_cleaned_ceas_header_merged)}")
logging.info(
f'Total rows in Processed CEAS_08 Dataframe: {len(df_processed_ceas)}')
logging.info(
f"Total rows in Merged Cleaned Headers of CEAS_08 Dataframe: {len(df_cleaned_ceas_headers_merge)}")
if len(df_cleaned_ceas_headers_merge) != len(df_processed_ceas):
logging.error(
"The number of rows in the Merged Cleaned Headers of CEAS_08 DataFrame does not match Processed CEAS_08.")
raise ValueError(
"The number of rows in the Merged Cleaned Headers of CEAS_08 DataFrame does not match Processed CEAS_08.\n")
else:
logging.info(
"The number of rows in the Merged Cleaned Headers of CEAS_08 DataFrame matches Processed CEAS_08.")
df_cleaned_ceas_headers_merge.to_csv(
file_paths['merged_cleaned_ceas_headers'], index=False)
logging.info(
f"Merged Cleaned Headers of CEAS_08 DataFrame successfully saved to {file_paths['merged_cleaned_ceas_headers']}")
logging.info(
f"Data Cleaning of CEAS_08 ['sender', 'receiver'] completed.\n")
# ****************************** #
# Data Integration #
# ****************************** #
logging.info(f"Beginning Data Integration...")
# Merging Processed SpamAssassin dataset with the extracted information
logging.info(
f"Merging Processed Spam Assassin and Spam Assassin Header Dataframes...")
merged_spamassassin_df = merge_dataframes(
df_processed_spamassassin, spamassassin_headers_df, on_column='index',
rename_columns={'text': 'body'},
select_columns=['sender', 'receiver', 'https_count', 'http_count', 'blacklisted_keywords_count',
'short_urls', 'has_ip_address', 'urls', 'body', 'label', 'index']
)
verify_merged_dataframe(merged_spamassassin_df, df_processed_spamassassin,
'Spam Assassin', file_paths['merged_spam_assassin_file'])
# Merge Processed CEAS_08 dataset with the extracted information
logging.info(
f"Merging Processed CEAS_08 and CEAS_08 Header Dataframes...")
merged_ceas_df = merge_dataframes(
df_cleaned_ceas_headers_merge, ceas_headers_df, on_column='index',
select_columns=['sender', 'receiver', 'https_count', 'http_count', 'blacklisted_keywords_count',
'short_urls', 'has_ip_address', 'urls', 'body', 'label', 'index']
)
verify_merged_dataframe(
merged_ceas_df, df_cleaned_ceas_headers_merge, 'CEAS_08', file_paths['merged_ceas_file'])
# Merge Spam Assassin and CEAS_08 datasets
logging.info(f"Merging Spam Assassin and CEAS_08 Dataframes...")
common_columns = ['sender', 'receiver', 'https_count', 'http_count',
'blacklisted_keywords_count', 'short_urls', 'has_ip_address', 'urls', 'body', 'label']
combined_df = combine_dataframes(
merged_spamassassin_df, merged_ceas_df, common_columns)
verify_combined_dataframe(combined_df, combined_percentage_df)
save_combined_dataframe(combined_df, file_paths['merged_data_frame'])
logging.info(f"Data Integration completed.\n")
# ************************* #
# Data Cleaning #
# ************************* #
logging.info(f"Beginning Data Cleaning ['body']...")
df_clean_body = load_or_clean_data(
'Merged Dataframe', combined_df, 'body', "output/main_model_evaluation/data_cleaning/cleaned_data_frame.csv", data_cleaning)
            # Concatenate the cleaned 'body' column with the Merged DataFrame,
            # then verify the Cleaned Combined DataFrame
df_cleaned_combined = combine_columns_for_cleaning(
combined_df, df_clean_body)
verify_combined_dataframe(combined_df, df_cleaned_combined)
# Save the cleaned combined DataFrame to CSV
save_dataframe_to_csv(df_cleaned_combined,
file_paths['merged_cleaned_data_frame'])
# ***************************** #
# Noise Injection #
# ***************************** #
logging.info(f"Beginning Noise Injection...")
noisy_df = generate_noisy_dataframe(
df_cleaned_combined, 'output/main_model_evaluation/noise_injection/noisy_data_frame.csv')
logging.info(f"Noise Injection completed.\n")
# ************************* #
# Data Splitting #
# ************************* #
logging.info(f"Beginning Data Splitting...")
folds = stratified_k_fold_split(noisy_df)
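            # stratified_k_fold_split is expected to yield one
            # (X_train, X_test, y_train, y_test) tuple per fold, which the
            # loop below consumes.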
logging.info(f"Data Splitting completed.\n")
# Initialize lists to store accuracies for each fold
fold_train_accuracies = []
fold_test_accuracies = []
learning_curve_data = []
for fold_idx, (X_train, X_test, y_train, y_test) in enumerate(folds, start=1):
# ************************************************************ #
# Feature Extraction and Data Imbalance Handling #
# ************************************************************ #
logging.info(
f"Beginning Feature Extraction for Fold {fold_idx}...")
# Define columns for categorical, numerical, and text data
categorical_columns = ['sender', 'receiver']
numerical_columns = ['https_count', 'http_count',
'blacklisted_keywords_count', 'urls', 'short_urls', 'has_ip_address']
text_column = 'cleaned_text'
# Initialize BERT feature extractor and transformer
bert_extractor = BERTFeatureExtractor()
bert_transformer = BERTFeatureTransformer(
feature_extractor=bert_extractor)
# Define preprocessor for categorical and numerical columns
preprocessor = ColumnTransformer(
transformers=[
('cat', Pipeline([
('rare_cat_remover', RareCategoryRemover(
threshold=0.05)), # Remove rare categories
('imputer', SimpleImputer(strategy='most_frequent')),
('encoder', OneHotEncoder(
sparse_output=False, handle_unknown='ignore'))
]), categorical_columns),
('num', Pipeline([
('imputer', SimpleImputer(strategy='mean')),
('scaler', StandardScaler())
]), numerical_columns)
],
remainder='passthrough' # Keep other columns unchanged, like 'cleaned_text' and 'label'
)
# Define pipeline with preprocessor, BERT, and SMOTE
pipeline = Pipeline(steps=[
('preprocessor', preprocessor),
('bert_features', bert_transformer),
('smote', SMOTE(random_state=42))
])
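                # Note: SMOTE exposes fit_resample rather than transform, so a plain
                # scikit-learn Pipeline cannot apply it as an ordinary transform step;
                # run_pipeline_or_load is assumed to apply the preprocessing and BERT
                # steps to both splits and to resample only the training data.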
# Call the function to either run the pipeline or load preprocessed data
X_train_balanced, X_test_combined, y_train_balanced, y_test = run_pipeline_or_load(
fold_idx=fold_idx,
X_train=X_train,
X_test=X_test,
y_train=y_train,
y_test=y_test,
pipeline=pipeline,
dir='output/main_model_evaluation/feature_extraction',
)
logging.info(
f"Data for Fold {fold_idx} has been processed or loaded successfully.\n")
# ***************************************** #
# Model Training and Evaluation #
# ***************************************** #
logging.info(
f"Beginning Model Training and Evaluation for Fold {fold_idx}...")
# Train the model and evaluate the performance for each fold
model_path = get_model_path(config, fold_idx)
params_path = get_params_path(config, fold_idx)
ensemble_model, test_accuracy = main_model_training(
X_train_balanced,
y_train_balanced,
X_test_combined,
y_test,
model_path=model_path,
params_path=params_path,
)
fold_test_accuracies.append(test_accuracy)
logging.info(
f"Data for Fold {fold_idx} has been processed, model trained, and evaluated.\n")
# Store learning curve data for later plotting
learning_curve_data.append(
(X_train_balanced, y_train_balanced, ensemble_model, fold_idx))
# ********************************* #
# Plot Learning Curves #
# ********************************* #
for X_train, y_train, ensemble_model, fold_idx in learning_curve_data:
plot_learning_curve(
estimator=ensemble_model,
X=X_train,
y=y_train,
title=f"Learning Curve for Fold {fold_idx}",
train_sizes=np.linspace(0.1, 1.0, 6),
cv=6
)
logging.info(f"Training and evaluation completed for all folds.\n")
# Calculate and log the overall test accuracy
mean_test_accuracy = np.mean(fold_test_accuracies)
logging.info(f"Overall Test Accuracy: {mean_test_accuracy * 100:.2f}%")
except Exception as e:
logging.error(f"An error occurred: {e}")
return
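
# Run this script from the project root so that the relative 'config/' and
# 'output/' paths used above resolve correctly.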
if __name__ == "__main__":
main()