-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocessing.py
74 lines (67 loc) · 2.97 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
def load_data(data_dir='./data'):
    """Load, label, merge, and de-duplicate the news datasets.

    Reads ``True_<year>.csv`` and ``Fake_<year>.csv`` for 2020, 2022 and 2024
    plus ``WELFake.csv`` from ``data_dir``. Only the ``title`` and ``text``
    columns are kept from the yearly files; true articles get ``target = 0``
    and fake articles get ``target = 1``. The ``label`` column of
    ``WELFake.csv`` is renamed to ``target`` and used unchanged.

    Args:
        data_dir: Directory containing the CSV files. Defaults to ``./data``.

    Returns:
        A DataFrame with columns ``title``, ``text`` and ``target``, with
        rows that repeat an earlier (``title``, ``text``) pair removed and a
        contiguous 0..n-1 index.
    """
    from pathlib import Path

    data_dir = Path(data_dir)
    text_columns = ['title', 'text']
    years = (2020, 2022, 2024)

    def read_labelled(prefix, target):
        # Stack the yearly files for one class and attach its label.
        frames = [
            pd.read_csv(data_dir / f'{prefix}_{year}.csv')[text_columns]
            for year in years
        ]
        combined = pd.concat(frames, axis=0).reset_index(drop=True)
        combined['target'] = target
        return combined

    true_df = read_labelled('True', 0)
    fake_df = read_labelled('Fake', 1)

    # NOTE(review): WELFake labels are used as-is; confirm that its encoding
    # matches the convention above (0 = true, 1 = fake).
    add_df = pd.read_csv(data_dir / 'WELFake.csv')[['title', 'text', 'label']]
    add_df = add_df.rename(columns={'label': 'target'})

    df = pd.concat([true_df, fake_df, add_df], axis=0).reset_index(drop=True)
    # Keep the first occurrence of each (title, text) pair, then renumber so
    # the returned index has no gaps left by the dropped rows.
    df = df.drop_duplicates(subset=text_columns).reset_index(drop=True)
    return df
def plot_distribution(df):
    """Display a bar chart of the relative frequency of each target value.

    Args:
        df: DataFrame exposing a ``target`` column.
    """
    # normalize=True turns the raw counts into fractions of all rows.
    target_shares = df.target.value_counts(normalize=True)
    target_shares.plot(kind='bar')

    plt.title('Target Distribution')
    plt.xlabel('Target')
    plt.ylabel('Frequency')
    plt.show()
# Patterns are compiled once because text_preprocessing runs on every row.
_BRACKETED_RE = re.compile(r'\[.*?\]')
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<.*?>+')
_PUNCTUATION_RE = re.compile(r'[%s]' % re.escape(string.punctuation + "–—−±×÷"))
_DIGIT_WORD_RE = re.compile(r'\w*\d\w*')
_WHITESPACE_RE = re.compile(r'\s+')


def text_preprocessing(text):
    """Normalise one piece of article text for modelling.

    Steps, in order: lowercase; drop ``[...]`` spans, URLs and ``<...>``
    tags; drop ASCII punctuation plus a few dash/math symbols; drop every
    token that contains a digit; drop the literal substring ``reuters``;
    collapse any run of whitespace into a single space and trim the ends.

    Args:
        text: The raw value. Anything that is not a ``str`` (for example a
            NaN from a missing CSV cell) is treated as empty text.

    Returns:
        The cleaned string, or ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = _BRACKETED_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _HTML_TAG_RE.sub('', text)
    text = _PUNCTUATION_RE.sub('', text)
    text = _DIGIT_WORD_RE.sub('', text)
    text = text.replace('reuters', '')
    # Newlines, tabs and carriage returns become a single space rather than
    # being deleted, so words on adjacent lines are not fused together.
    return _WHITESPACE_RE.sub(' ', text).strip()
def preprocess_data(df, *, random_state=None):
    """Clean the ``title`` and ``text`` columns and shuffle the rows.

    Args:
        df: DataFrame with ``title`` and ``text`` columns. Both columns are
            overwritten on the passed-in frame with their cleaned values.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. The default ``None`` keeps the
            unseeded shuffle.

    Returns:
        A new DataFrame containing every row in shuffled order with a fresh
        0..n-1 index.
    """
    for column in ('title', 'text'):
        df[column] = df[column].apply(text_preprocessing)

    # frac=1 samples every row, i.e. a full shuffle.
    shuffled = df.sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)
def split_and_save(df, output_dir='./data', test_size=0.2, random_state=42):
    """Split the dataset into train/test parts and write them as CSV files.

    The split is stratified on the ``target`` column. The two parts are
    written to ``train.csv`` and ``test.csv`` inside ``output_dir`` without
    the index, and their sizes are printed.

    Args:
        df: DataFrame to split; must contain a ``target`` column.
        output_dir: Directory receiving the CSV files. It is created if it
            does not exist. Defaults to ``./data``.
        test_size: Passed to ``train_test_split``. Defaults to ``0.2``.
        random_state: Seed passed to ``train_test_split``. Defaults to ``42``.
    """
    from pathlib import Path

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    train_df, test_df = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=df['target'],
    )
    train_df.to_csv(output_dir / 'train.csv', index=False)
    test_df.to_csv(output_dir / 'test.csv', index=False)

    print("Train dataset size : ", len(train_df))
    print("Test dataset size : ", len(test_df))
if __name__ == "__main__":
    # Read and merge the raw CSV files.
    raw_news = load_data()
    print("News dataset size : ", len(raw_news), end= "\n")

    # Show the target distribution before any text cleaning.
    plot_distribution(raw_news)

    # Run the text preprocessing step and preview the first rows.
    cleaned_news = preprocess_data(raw_news)
    print(cleaned_news.head(), end = "\n\n")

    # Write the train/test split to disk.
    split_and_save(cleaned_news)
    print("News data preprocessing is done.")