forked from zhuli8805/CBT
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Preprocessor.py
75 lines (65 loc) · 2.32 KB
/
Preprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# -*- coding: utf-8 -*-
"""
Created on Thu Dec 1 11:08:32 2016
@author: ZHULI
"""
import re
from nltk.stem import PorterStemmer
from read_npl_tag import get_nlp_annotate, get_all_nlp_tag
class Preprocessor:
    """Turn raw text lines into normalised word tokens.

    Optional behaviours, chosen at construction time:

    * reverse the order of the words in each line,
    * drop words found in a stop list,
    * stem words with NLTK's ``PorterStemmer``.

    The markers ``START`` and ``END`` are sentinels: they are passed through
    verbatim and are never lower-cased, stemmed or filtered.
    """

    # Sentinel tokens that must survive preprocessing unchanged.
    _SENTINELS = ('START', 'END')
    # Compiled once for all instances; the raw string avoids the
    # invalid-escape warning that the plain literal '\w+' triggers.
    _WORD_RE = re.compile(r'\w+')

    def __init__(self, isReversed=False, isStop=False, isStem=False):
        """Configure the preprocessor.

        isReversed -- emit the words of each line in reverse order.
        isStop     -- load the default stop list ('stop_list.txt') right away.
        isStem     -- stem tokens with PorterStemmer.
        """
        self.isReversed = isReversed
        self._isStem = isStem
        self._stops = set()
        if isStop:
            self.readStopList()

    def readStopList(self, file='stop_list.txt'):
        """Add every line of *file*, stripped of surrounding whitespace, to the stop set.

        Raises whatever ``open`` raises when the file cannot be read.
        """
        # The context manager guarantees the handle is closed, even on error.
        with open(file, 'r') as stop_file:
            self._stops.update(entry.strip() for entry in stop_file)

    def getLine(self, line):
        """Return the processed tokens of *line* as one string.

        Every token is preceded by a single space, so a non-empty result
        starts with a space; an input without tokens gives ''.
        """
        return ''.join(' ' + token for token in self.getToken(line))

    def getToken(self, line):
        """Yield the processed tokens of *line* one by one.

        Words are runs of ``\\w`` characters. Sentinels are yielded exactly
        once and unchanged. Any other word is skipped when its original
        spelling is in the stop set; otherwise it is (optionally) stemmed and
        lower-cased.
        """
        words = self._WORD_RE.findall(line)
        if self.isReversed:
            words.reverse()
        # One stemmer per call instead of one per token; it is only created
        # when stemming was requested.
        stemmer = PorterStemmer() if self._isStem else None
        for token in words:
            if token in self._SENTINELS:
                yield token
                # Skip the normalisation below so the sentinel is not
                # emitted a second time in lower case.
                continue
            if token in self._stops:
                continue
            if stemmer is not None:
                token = stemmer.stem(token)
            yield token.lower()

    def getWord(self, word):
        """Process a single word.

        Returns the sentinel unchanged, ``None`` for a stop word, the
        lower-cased stem when stemming is enabled, and otherwise the word
        exactly as given (no lower-casing is applied here).
        """
        if word in self._SENTINELS:
            return word
        if word in self._stops:
            return None
        if self._isStem:
            word = PorterStemmer().stem(word).lower()
        return word
class Preprocessor_WP(Preprocessor):
    """Preprocessor variant that works on (word, POS tag) pairs."""

    def __init__(self, isSimplePOS, isReversed=False, isStop=False, isStem=False):
        """Forward the common options to Preprocessor.

        NOTE(review): isSimplePOS is accepted but neither stored nor used.
        """
        Preprocessor.__init__(
            self,
            isReversed=isReversed,
            isStop=isStop,
            isStem=isStem,
        )

    def getToken_wordpos(self, line):
        """Yield (processed word, pos) tuples for *line*.

        The pairs come from get_all_nlp_tag(line, True) -- presumably a list
        of (word, pos) pairs; confirm against read_npl_tag. Words other than
        'START'/'END' are lower-cased before being passed to getWord, so the
        first element is whatever getWord returns for them.
        """
        tagged = get_all_nlp_tag(line, True)
        if self.isReversed:
            tagged.reverse()
        for word, pos in tagged:
            normalised = word if word in ('START', 'END') else word.lower()
            yield self.getWord(normalised), pos

    def getPOS_line(self, line):
        """Return get_all_nlp_tag(line) -- per the original note, a list of
        the POS tags of the words in the line (TODO confirm in read_npl_tag).
        """
        tags = get_all_nlp_tag(line)
        return tags

    def getSimplePOS(self, POS):
        """Placeholder: not implemented, always returns None."""
        return None