-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathNLP.py
76 lines (63 loc) · 2.05 KB
/
NLP.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# Importing the libraries
import os
import pickle
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.datasets import load_files
# Make sure the NLTK stop-word lists are available locally.
nltk.download('stopwords')

# Directory containing the `txt_sentoken` dataset folder.  BASE_DIR was
# referenced but never defined in this file; anchoring it to the script's own
# directory is an assumption -- point it elsewhere if the data lives elsewhere.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Load the review files; `data` holds the raw documents and `target` the
# integer class label of each document.
reviews = load_files(os.path.join(BASE_DIR, 'txt_sentoken'))
X, y = reviews.data, reviews.target
# nltk == Natural Language Toolkit
# stopwords == very common words that carry little meaning, e.g. is, a, an, the
#bag of words model
#NLTK PREPROCESSING
# Regex cheat-sheet for the clean-up below:
#   re.sub(pattern, replacement, text) -> text with every match replaced
#   \W  matches any non-word character (anything but letters, digits and _)
#   \s  matches any whitespace character
#   ^   anchors the match to the beginning of the string
#   str.lower() converts every character to lower case


def _clean_review(raw):
    """Return *raw* as lower-case text of single-space-separated tokens.

    Non-word characters become spaces, isolated single letters are dropped,
    and runs of whitespace are collapsed to one space.
    """
    # NOTE(review): str() on a bytes object yields its repr ("b'...'"); the
    # single-letter rules below then remove the stray leading "b".  Confirm
    # whether X holds bytes or str.
    text = re.sub(r'\W', ' ', str(raw))
    text = text.lower()
    # Drop single letters surrounded by whitespace (left over after
    # punctuation such as apostrophes has been blanked out).
    text = re.sub(r'\s+[a-z]\s+', ' ', text)
    # Drop a single letter at the very start of the text.
    text = re.sub(r'^[a-z]\s+', ' ', text)
    # Collapse whitespace runs.  The pattern used to be r's+', which replaced
    # every letter "s" with a space and mangled the vocabulary.
    text = re.sub(r'\s+', ' ', text)
    return text


# One cleaned string per raw document, in the same order as X.
corpus = [_clean_review(document) for document in X]
# File modes: 'w' write, 'r' read, 'w+'/'r+' read and write; the trailing 'b'
# opens the file in binary mode, which pickle requires.

# Persist the raw documents and their labels to the working directory.
with open('X.pickle', 'wb') as f:
    pickle.dump(X, f)
with open('y.pickle', 'wb') as f:
    pickle.dump(y, f)

# Read both files back, rebinding X and y to the unpickled copies.
with open('X.pickle', 'rb') as f:
    X = pickle.load(f)
with open('y.pickle', 'rb') as f:
    y = pickle.load(f)
# Count vectorizer: bag-of-words term counts for the cleaned corpus.
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(
    max_features=len(X),  # cap the vocabulary at the number of documents
    min_df=3,             # ignore terms found in fewer than 3 documents
    max_df=0.6,           # ignore terms found in more than 60% of documents
    # NLTK names its stop-word files in lower case; 'English' only resolves
    # on case-insensitive filesystems.
    stop_words=stopwords.words('english'),
)
# Dense count matrix.  Note the lower-case name: nothing later in this file
# reads `x`; the classifier is trained on the TF-IDF matrix `X` below.
x = count.fit_transform(corpus).toarray()

# TF-IDF features (default settings) computed directly from the corpus.
from sklearn.feature_extraction.text import TfidfVectorizer
transformer = TfidfVectorizer()
X = transformer.fit_transform(corpus)
X = X.toarray()  # convert the sparse result to a dense array
# Hold out 20% of the documents for evaluation; the fixed seed keeps the
# split reproducible between runs.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a logistic-regression classifier and predict the held-out labels.
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Persist the trained classifier together with the vectorizer that produced
# its training features.
with open('Classfier.pickle', 'wb') as f:
    pickle.dump(clf, f)
with open('TFidf.pickle', 'wb') as f:
    # Previously this stored `count`, a CountVectorizer whose vocabulary and
    # weighting do not match the features `clf` was trained on; the model was
    # fitted on the output of `transformer`.
    pickle.dump(transformer, f)
# CONFUSION MATRIX
from sklearn.metrics import confusion_matrix
# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted ones.  The arguments used to be passed the other way
# round, which transposed the matrix.
cm = confusion_matrix(y_test, y_pred)

# Example input text; it is not used anywhere else in this file.
sample = ['this is a good guy,have a great life']
# WORKING CODE