-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsanitize.py
78 lines (68 loc) · 2.39 KB
/
sanitize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# sanitize.py
# sanitize wordlists
import argparse
from num2words import num2words
from word2number import w2n
import re
def convert_to_word_form(s: str) -> str:
    """Spell out every run of decimal digits in *s* as English words.

    Each maximal run of decimal digits is converted with
    ``num2words(int(run), lang='en')``; all other text is copied through
    unchanged. Afterwards every space character is removed from the whole
    result, so spaces that were already present in *s* disappear as well.
    Only the space character is stripped; any other separator that
    num2words puts in its output is kept.

    Args:
        s: Text to convert.

    Returns:
        The converted text, containing no space characters.
    """
    # Substitute on ``\d+`` only instead of testing chunks with
    # ``str.isdigit()``: isdigit() is also true for characters such as '²'
    # that ``\d`` does not match and ``int()`` rejects with ValueError.
    spelled = re.sub(
        r'\d+',
        lambda match: num2words(int(match.group()), lang='en'),
        s,
    )
    return spelled.replace(' ', '')
# WIP
def convert_to_number_form(s: str) -> str:
    """Replace spelled-out English numbers in *s* with digits (work in progress).

    The characters of *s* are appended one at a time to a growing candidate
    string. Whenever ``w2n.word_to_num`` accepts the candidate, every
    occurrence of the candidate in the result is replaced by the parsed
    number. When parsing fails and a non-zero number has already been found,
    the candidate is reset to empty.

    NOTE(review): known limitations of this WIP algorithm, kept as-is:
      * Until the first non-zero number is found the candidate is never
        reset, so a number word that is not at the start of *s* is not
        recognised.
      * The shortest parsable candidate wins, so a longer word that begins
        with a shorter number word (presumably e.g. "sixty" -> "6ty",
        assuming word_to_num accepts "six") is only partially converted.

    Args:
        s: Text to convert.

    Returns:
        *s* with the recognised candidates replaced by their numeric form.
    """
    candidate = ''
    num = 0
    # The loop walks the characters of the *original* string; rebinding
    # ``s`` inside the loop does not change what is iterated.
    for c in s:
        candidate += c
        try:
            num = w2n.word_to_num(candidate)
        except Exception:
            # Best-effort parse: any failure just means "not a number yet".
            # ``Exception`` (rather than a bare ``except``) lets
            # KeyboardInterrupt and SystemExit propagate.
            if num != 0:
                candidate = ''
        else:
            s = s.replace(candidate, str(num))
    return s
def build_parser():
    """Create the command-line parser for the wordlist sanitizer."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-w', '--wordlist', help='wordlist to cleanse', required=True)
    # ``choices`` makes argparse reject an invalid mode and exit, instead of
    # printing a warning and then processing the file anyway.
    parser.add_argument(
        '-n', '--numbers',
        help="number replacement mode: "
             "1 = replace number with the english representation (50 -> 'fifty'), "
             "2 = replace english with the number representation ('fifty' -> 50)",
        required=False, type=int, choices=(1, 2),
    )
    # -c @!#%$^&*()_+|}{:?><~`-=[]\;',./
    parser.add_argument('-c', '--chars', help='symbols to replace', required=False, type=str)
    # -p http,xml,html,php,asp,aspx,js,css,sql,etc
    parser.add_argument('-p', '--phrases', help='phrases to replace', required=False, type=str)
    return parser


def parse_phrases(raw):
    """Split a comma-separated phrase list into its non-empty phrases.

    Both ``"a,b"`` and ``"a, b"`` are accepted; whitespace around each
    phrase is dropped. ``None`` or an empty string yields an empty list.
    """
    if not raw:
        return []
    stripped = (part.strip() for part in raw.split(','))
    return [phrase for phrase in stripped if phrase]


def sanitize_line(line, number_mode=None, chars=None, phrases=()):
    """Return the cleansed form of one wordlist line (without a newline).

    The line is stripped, lower-cased and has its spaces removed. Then, in
    order: the optional number conversion is applied (mode 1 calls
    ``convert_to_word_form``, mode 2 calls ``convert_to_number_form``),
    every character in *chars* is deleted, and every phrase in *phrases*
    is deleted.
    """
    line = line.strip().lower().replace(' ', '')
    if number_mode == 1:
        # int -> english (50 -> fifty)
        line = convert_to_word_form(line)
    elif number_mode == 2:
        # english -> int (fifty -> 50)
        line = convert_to_number_form(line)
    if chars:
        # One pass that deletes every listed character.
        line = line.translate(str.maketrans('', '', chars))
    for phrase in phrases:
        line = line.replace(phrase, '')
    return line


def main(argv=None):
    """Cleanse the wordlist named on the command line.

    Reads ``<wordlist>`` line by line and writes the sanitized lines to
    ``<wordlist>-cleansed.txt``.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)
    phrases = parse_phrases(args.phrases)
    # Both files are managed by ``with`` so the output is flushed and closed
    # even if processing fails part-way through.
    with open(args.wordlist, 'r') as src, open(f"{args.wordlist}-cleansed.txt", 'w') as dst:
        for line in src:
            cleaned = sanitize_line(line, args.numbers, args.chars, phrases)
            dst.write(f"{cleaned}\n")


if __name__ == "__main__":
    main()