# Training_Coarse.py
import re

import nltk
from nltk.tag.stanford import NERTagger  # renamed StanfordNERTagger in newer NLTK releases
from practnlptools.tools import Annotator

from readproperties import read_property
##removing special characters from sentence##
def preprocess(raw_sentence):
    # Note: inside a character class '|' is literal, so the original pattern
    # also stripped pipe characters; the class below keeps only the intended set.
    sentence = re.sub(r'[$.!"(),;`\']', '', raw_sentence)
    return sentence
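# Illustrative only: preprocess('How did serfdom develop, and why?')
# returns 'How did serfdom develop and why?' -- the comma is dropped,
# while '?' is not in the character class and survives.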
##making the file format ready to use##
# Each training line appears to start with a class-label token followed by
# the question words (TREC-style); the label is split off here.
def file_preprocess(filename):
    corpus = []
    classes = []
    f = open(filename, 'r')
    fi = open(read_property('word_features_train_coarse_path'), "w")
    for line in f.readlines():
        line = preprocess(line.rstrip('\n'))
        words = line.split()
        if not words:
            continue
        # Bug fix: the original returned 'classes' without ever filling it.
        classes.append(words[0])
        sentence = ""
        for i in range(1, len(words)):
            sentence = sentence + words[i] + " "
        fi.write(sentence + "\n")
        corpus.append(sentence)
    f.close()
    fi.close()
    return corpus, classes
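# Illustrative only, assuming a TREC-style line such as
#   "DESC:manner How did serfdom develop and why ?"
# file_preprocess would then yield
#   corpus  -> ['How did serfdom develop and why ? ']
#   classes -> ['DESC:manner']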
##Compute POS##
def compute_POS_Tags(corpus):
    fi = open(read_property('POS_features_train_coarse_path'), "w")
    for sentence in corpus:
        text = nltk.word_tokenize(sentence)
        pos_seq = nltk.pos_tag(text)
        # Keep only the tag of each (word, tag) pair, space-separated.
        pos_tags = ""
        for pos in pos_seq:
            pos_tags = pos_tags + pos[1] + " "
        fi.write(pos_tags + "\n")
    fi.close()
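# Illustrative only: for 'How did serfdom develop ?', nltk.pos_tag returns
# pairs like [('How', 'WRB'), ('did', 'VBD'), ('serfdom', 'NN'), ...], so the
# line written out is a tag sequence such as 'WRB VBD NN VB .'.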
##Compute NER##
def compute_NER(corpus):
    fi = open(read_property('NER_features_train_coarse_path'), "w")
    # NERTagger ships with older NLTK releases; newer versions expose the
    # same functionality as nltk.tag.stanford.StanfordNERTagger.
    st = NERTagger(read_property('StanfordNerClassifier'), read_property('StanfordNerJarPath'))
    for sentence in corpus:
        ner = st.tag(sentence.split())
        # Keep only the entity tag of each (word, tag) pair, space-separated.
        ner_tag = ""
        for n in ner:
            ner_tag = ner_tag + n[1] + " "
        fi.write(ner_tag + "\n")
    fi.close()
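# Illustrative only: for 'Who is the president of Russia ?', the Stanford
# tagger returns pairs like [('Who', 'O'), ..., ('Russia', 'LOCATION')], so
# the written line is a tag sequence such as 'O O O O O LOCATION O'.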
##Compute Chunks##
def compute_Chunks(corpus):
    fi = open(read_property('Chunk_features_train_path'), "w")
    annotator = Annotator()
    for sentence in corpus:
        chunks = annotator.getAnnotations(sentence)['chunk']
        chunk = ""
        for elem in chunks:
            chunk = chunk + elem[1] + " "
        fi.write(chunk + "\n")
    fi.close()
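# Illustrative only: practnlptools wraps SENNA, and getAnnotations(s)['chunk']
# yields (word, tag) pairs such as [('How', 'B-ADVP'), ('did', 'B-VP'), ...],
# so the written line is a chunk sequence like 'B-ADVP B-VP B-NP I-VP O'.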
##Extract all training features for the coarse classifier##
filename_train = read_property('trainingfilepath')
corpus, train_class = file_preprocess(filename_train)
compute_POS_Tags(corpus)
compute_NER(corpus)
compute_Chunks(corpus)
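# A minimal sketch of the properties read above (key names taken from the
# calls in this file; the example paths are hypothetical, not the project's):
#   trainingfilepath                = data/train_coarse.txt
#   word_features_train_coarse_path = features/word_train_coarse.txt
#   POS_features_train_coarse_path  = features/pos_train_coarse.txt
#   NER_features_train_coarse_path  = features/ner_train_coarse.txt
#   Chunk_features_train_path       = features/chunk_train.txt
#   StanfordNerClassifier           = stanford-ner/english.all.3class.distsim.crf.ser.gz
#   StanfordNerJarPath              = stanford-ner/stanford-ner.jar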