-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextractors.py
76 lines (59 loc) · 1.54 KB
/
extractors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from itertools import combinations
def normalize(word):
    """Return *word* lower-cased with common punctuation removed.

    The characters ``, < . > / ? ; : ' "`` are deleted wherever they occur
    in *word*; every other character is kept and lower-cased.  A word made
    only of those characters therefore normalizes to the empty string.
    """
    punctuation = ",<.>/?;:'\""
    # The two-argument ``word.translate(None, chars)`` form exists only for
    # Python 2 byte strings; it raises TypeError for Python 3 ``str`` and for
    # Python 2 ``unicode``.  Filtering character by character behaves the
    # same for every string type.
    kept = "".join(ch for ch in word if ch not in punctuation)
    return kept.lower()
class Extractor:
    """Base class that turns text into terms.

    Subclasses implement ``getTerms``.  The ``termsFrom*`` helpers split
    their input on whitespace, pass every word through ``normalize`` and
    hand the resulting lazy word stream to ``getTerms``.
    """

    def getTerms(self, iterator):
        """Build terms from an iterator of words; subclasses must override.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

    def termsFromString(self, string):
        """Return ``getTerms`` applied to the normalized words of *string*."""
        tokens = string.split()
        return self.getTerms(normalize(token) for token in tokens)

    def termsFromFile(self, file):
        """Return ``getTerms`` applied to the normalized words of *file*.

        *file* is iterated item by item and each item is split on
        whitespace (e.g. an open text file yielding lines).  Nothing is
        read until the returned terms are consumed.
        """
        def normalized_words():
            for line in file:
                for token in line.split():
                    yield normalize(token)

        return self.getTerms(normalized_words())
class NGramExtractor(Extractor):
    """Extractor whose terms are runs of ``n`` consecutive words."""

    def __init__(self, n):
        # n: number of words per gram; the logic below expects n >= 1.
        self.n = n

    def getTerms(self, words):
        """Yield every window of ``self.n`` consecutive words as a tuple.

        Args:
            words: any iterable of words (an iterator/generator is consumed
                lazily, one word per step).

        Yields:
            Tuples of exactly ``self.n`` words, in input order.  Nothing is
            yielded when the input holds fewer than ``self.n`` words.
        """
        # Driving the window with a ``for`` loop instead of bare ``next()``
        # calls keeps StopIteration from escaping the generator body, which
        # Python 3.7+ (PEP 479) turns into RuntimeError.  It also stops a
        # too-short input from producing a truncated (or empty) gram.
        gram = ()
        for word in words:
            gram += (word,)
            if len(gram) > self.n:
                gram = gram[1:]
            if len(gram) == self.n:
                yield gram
class Nto1GramExtractor(Extractor):
    """Extractor whose terms are all grams of length 1 up to ``n``."""

    def __init__(self, n):
        # n: maximum number of words per gram.
        self.n = n

    def getTerms(self, words):
        """Yield every gram of 1..``self.n`` consecutive words.

        For each start position the grams are yielded shortest first, e.g.
        ``n=3`` over ``a b c d`` gives
        ``(a) (a,b) (a,b,c) (b) (b,c) (b,c,d) (c) (c,d) (d)``.

        Args:
            words: any iterable of words, consumed lazily.

        Yields:
            Non-empty tuples of at most ``self.n`` consecutive words.
        """
        window = ()
        for word in words:
            window += (word,)
            if len(window) >= self.n:
                # Window is full: emit every prefix anchored at its first
                # word, then slide forward by one word.
                for size in range(1, self.n + 1):
                    yield window[:size]
                window = window[1:]
        # Input exhausted: the words still in the window have not been used
        # as a gram start yet.  Emit every prefix for each of them so that
        # trailing words get their shorter grams as well (yielding only the
        # suffixes would skip e.g. ``(c)`` in the example above), and never
        # emit an empty tuple when the input is shorter than n.
        while window:
            for size in range(1, len(window) + 1):
                yield window[:size]
            window = window[1:]
class SkipGramExtractor(Extractor):
    """Extractor whose terms are ``n``-word grams that may skip words."""

    def __init__(self, skip, n):
        # skip: how many extra words a gram may span beyond its n words.
        self.skip = skip
        # n: number of words in each gram.
        self.n = n

    def getTerms(self, words):
        """Yield skip-grams: a word plus ``n - 1`` of the words after it.

        Each gram starts at one input word and is completed with every
        order-preserving choice of ``self.n - 1`` words taken from the next
        ``self.n + self.skip - 1`` words (fewer near the end of the input).
        For example ``n=2, skip=1`` over ``a b c d`` gives
        ``(a,b) (a,c) (b,c) (b,d) (c,d)``.

        Args:
            words: any iterable of words, consumed lazily.

        Yields:
            Tuples of exactly ``self.n`` words.  Nothing is yielded when the
            input holds fewer than ``self.n`` words.
        """
        span = self.n + self.skip
        window = ()
        for word in words:
            window += (word,)
            if len(window) >= span:
                # Full window: emit all grams anchored at its first word,
                # then slide forward by one word.
                for rest in combinations(window[1:], self.n - 1):
                    yield window[:1] + rest
                window = window[1:]
        # Input exhausted: every word left in the window is an anchor that
        # has not been emitted yet.  Handling all of them here covers input
        # that ends before the first window fills, and avoids re-emitting
        # the final gram when skip == 0.
        while len(window) >= self.n:
            for rest in combinations(window[1:], self.n - 1):
                yield window[:1] + rest
            window = window[1:]