-
Notifications
You must be signed in to change notification settings - Fork 0
/
Email Spam Detection.py
100 lines (47 loc) · 1.27 KB
/
Email Spam Detection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env python
# coding: utf-8
# In[1]:
# Load the labelled message dataset. Later cells read its "Category" and
# "Message" columns.
import pandas as pd
df = pd.read_csv("spam.csv")
# A bare expression is only rendered inside a notebook; print so the preview
# is also visible when this file is executed as a script.
print(df.head())
# In[2]:
# Summary statistics of the remaining columns, grouped by category label.
print(df.groupby('Category').describe())
# In[3]:
# Binary target column: 1 where Category equals "spam", 0 for anything else.
# A vectorized comparison replaces the per-row Python lambda and yields the
# same integer column.
df["spam"] = (df["Category"] == "spam").astype("int64")
print(df.head())
# In[5]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the messages for evaluation. Fixing random_state makes the
# shuffle deterministic, so the split (and any score computed from it) is
# reproducible from run to run instead of changing on every execution.
X_train, X_test, y_train, y_test = train_test_split(
    df.Message, df.spam, test_size=0.25, random_state=42
)
# Number of training messages; printed because a bare expression is
# discarded when the file runs as a script.
print(len(X_train))
# In[6]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training messages only and encode each
# message as a row of token counts (a sparse document-term matrix).
v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)
# Preview the first three rows. Slice the sparse matrix *before* converting,
# so only those three rows are densified instead of the whole matrix.
print(X_train_count[:3].toarray())
# In[7]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial naive Bayes classifier on the count features.
model = MultinomialNB()
model.fit(X_train_count, y_train)
# In[8]:
# Two hand-written sample messages used to spot-check the fitted model.
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!',
]
# Encode with the already-fitted vectorizer: transform (not fit_transform)
# so the samples are mapped onto the vocabulary learned from training data.
emails_count = v.transform(emails)
# Predicted labels use the "spam" column encoding (1 = spam, 0 = not spam).
# Printed because a bare expression is discarded when run as a script.
print(model.predict(emails_count))
# In[9]:
# Score the classifier on the held-out messages, encoded the same way.
X_test_count = v.transform(X_test)
print(model.score(X_test_count, y_test))
# In[10]:
from sklearn.pipeline import Pipeline
# Chain a CountVectorizer and a MultinomialNB into one estimator so raw text
# can be passed directly to fit / score / predict without a separate
# vectorization step.
clf = Pipeline([
    ("Vectorizer", CountVectorizer()),
    ("nb", MultinomialNB()),
])
# In[11]:
# Fits the vectorizer and the classifier together on the raw training text.
clf.fit(X_train, y_train)
# In[12]:
# Score on the held-out raw messages; printed because a bare expression is
# discarded when the file runs as a script.
print(clf.score(X_test, y_test))
# In[13]:
# Predictions for the hand-written sample messages (1 = spam, 0 = not spam).
print(clf.predict(emails))
# In[ ]: