-
Notifications
You must be signed in to change notification settings - Fork 0
/
Parser.py
55 lines (41 loc) · 1.54 KB
/
Parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import re

#http://tartarus.org/~martin/PorterStemmer/python.txt
from PorterStemmer import PorterStemmer
class Parser:
    """Clean, tokenise, stem and stop-word-filter English text.

    ``stemmer`` and ``stopwords`` are populated per instance in ``__init__``;
    the class-level values below are placeholders only.
    """

    # A processor for removing the commoner morphological and inflexional
    # endings from words in English.
    stemmer = None
    stopwords = []

    # Characters that clean() deletes outright, applied in one translate pass.
    # NOTE(review): the original replace chain listed ":" twice -- ";" may
    # have been intended for one of them; confirm before adding it here.
    _PUNCTUATION_TABLE = str.maketrans("", "", ".,:[](){}/?!’'\"”")

    # Any run of whitespace (spaces, tabs, newlines) collapses to one space.
    _WHITESPACE_RE = re.compile(r"\s+")

    def __init__(self):
        """Create the stemmer and load the stop-word list.

        Reads ``english.stop`` (a relative path, so it is resolved against
        the current working directory) and splits it on whitespace.

        Raises:
            OSError: if ``english.stop`` cannot be opened.
        """
        self.stemmer = PorterStemmer()
        # English stopwords from ftp://ftp.cs.cornell.edu/pub/smart/english.stop
        # The context manager closes the handle instead of leaking it.
        with open('english.stop', 'r', encoding='utf-8') as stop_file:
            self.stopwords = stop_file.read().split()

    def clean(self, string):
        """Remove punctuation, collapse whitespace and lower-case ``string``.

        Args:
            string: the raw text to normalise.

        Returns:
            The text with the punctuation characters deleted, every run of
            whitespace replaced by a single space, and all letters
            lower-cased. Leading/trailing whitespace is reduced to a single
            space but not stripped.
        """
        string = string.translate(self._PUNCTUATION_TABLE)
        # str.replace() treats its argument literally, so a real regex
        # substitution is needed to collapse whitespace runs.
        string = self._WHITESPACE_RE.sub(" ", string)
        return string.lower()

    def removeStopWords(self, list):
        """Remove common words which have no search value.

        Args:
            list: iterable of word strings (name kept for backward
                compatibility even though it shadows the builtin).

        Returns:
            A new list holding, in order, the words that are not present in
            ``self.stopwords``.
        """
        # Build the lookup set once per call so each membership test is O(1).
        stopwords = frozenset(self.stopwords)
        return [word for word in list if word not in stopwords]

    def tokenise(self, string):
        """Break ``string`` up into tokens and stem each one.

        Args:
            string: the raw text to tokenise.

        Returns:
            A list with one stemmed token per whitespace-separated word of
            the cleaned text; empty input yields an empty list.
        """
        # split() with no argument never yields empty-string tokens, unlike
        # split(" ") on text with leading, trailing or repeated spaces.
        words = self.clean(string).split()
        return [self.stemmer.stem(word, 0, len(word) - 1) for word in words]