-
Notifications
You must be signed in to change notification settings - Fork 0
/
Wordlist.py
executable file
·69 lines (63 loc) · 1.9 KB
/
Wordlist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import os, re
from nltk.corpus import brown # Probably an unencumbered license
from nltk import FreqDist
# Module-level cache for the Brown-corpus word frequency distribution.
# Starts empty and is built on first use inside patternFind().
fd = None
def load(filename):
    """Load a wordlist file and return its entries as a list of strings.

    Wordlist file format:
        ...            (header lines, ignored)
        [Start]
        WORD/REGEX
        WORD/REGEX
        ...
        EOF

    Every line up to and including the first ``[Start]`` line is skipped.
    Each following line becomes one entry with its line terminator removed.
    If the file contains no ``[Start]`` line, an empty list is returned.

    Raises:
        OSError: (errno 2) if ``filename`` does not exist.
    """
    if not os.path.exists(filename):
        raise OSError(2, 'Error reading from file')
    # The context manager closes the file even if reading raises.
    with open(filename) as f:
        # Skip the header: consume lines up to and including the marker.
        for line in f:
            if line.rstrip('\n') == '[Start]':
                break
        # Strip only the newline (not a blind [:-1]) so that a final line
        # without a trailing newline keeps its last character.
        return [line.rstrip('\n') for line in f]
def compile_lst(lst):
    """Build one compiled regular expression that matches any entry of lst.

    The entries are joined with ``|`` inside a single capturing group that is
    wrapped in word-boundary anchors. The pattern is compiled with both
    ``re.VERBOSE`` and ``re.IGNORECASE``, so matching is case-insensitive and
    unescaped whitespace inside an entry is ignored by the regex engine.
    """
    alternation = '|'.join(lst)
    flags = re.VERBOSE | re.IGNORECASE
    return re.compile('\\b(' + alternation + ')\\b', flags)
def stripAbbrevations(lst):
    """Remove parenthesised abbreviations from the entries of lst, in place.

    An entry such as "WORD (WRD)" is replaced in lst by "WORD". Entries
    without a parenthesised part are left unchanged.

    Returns a dictionary mapping each abbreviation to its full word.
    """
    in_parens = re.compile(r'\((.*)\)')
    before_paren = re.compile(r'(.*)\(')
    mapping = {}
    for index, entry in enumerate(lst):
        short_match = in_parens.search(entry)
        if not short_match:
            continue
        # A "(...)" match guarantees an opening parenthesis, so this search
        # always succeeds here.
        long_form = before_paren.search(entry).group(1).strip()
        mapping[short_match.group(1)] = long_form
        lst[index] = long_form
    return mapping
def patternFind(pattern, text, uncommon_only=True):
    """Return the set of matches of pattern in the lower-cased text.

    When uncommon_only is true, only matches whose count in the Brown-corpus
    frequency distribution is below 3 are kept. The distribution is built on
    first use and cached in the module-level ``fd``.
    """
    global fd
    found = re.findall(pattern, text.lower())
    if not uncommon_only:
        return set(found)
    if not fd:
        # Build the frequency table once and reuse it on later calls.
        fd = FreqDist(brown.words())
    # The cutoff of 3 is a rough heuristic that works acceptably in practice.
    return {word for word in found if fd[word] < 3}
def listFind(lst, text):
    """Collect, as a set, every element of lst that is contained in text."""
    return {item for item in lst if item in text}