forked from RasaHQ/rasa_lookup_demo
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfilter_lookup.py
67 lines (49 loc) · 1.74 KB
/
filter_lookup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import csv
import sys
"""
This script reads a list of startup names and removes
every startup whose name is also a Scrabble word.
"""
def open_data(filename):
    """Load every cell of a CSV file into one flat list.

    Rows are concatenated in file order, so a file with one value per line
    and a file with all values on a single comma-separated line both yield
    the same kind of result.

    Args:
        filename: path of the CSV file to read.

    Returns:
        list of str: all fields from all rows, in reading order.
    """
    out_list = []
    print('loading data from {}'.format(filename))
    # newline='' hands line-ending handling to the csv module; without it,
    # line breaks embedded in quoted fields are altered while reading.
    with open(filename, 'rt', newline='') as f:
        for row in csv.reader(f):
            out_list.extend(row)
    print('found {} elements'.format(len(out_list)))
    return out_list
def write_data(filename, filtered_startups):
    """Write the given names to a CSV file as a single row.

    Args:
        filename: path of the file to create or overwrite.
        filtered_startups: iterable of values; each becomes one field of
            the single output row.
    """
    print('writing that to file at : {}'.format(filename))
    # newline='' stops the text layer from translating the writer's '\n'
    # terminator into the platform line ending.
    with open(filename, "w", newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(filtered_startups)
def filter_list(in_list, scrabble_list):
    """Return the names from ``in_list`` that are not made of scrabble words.

    A name is dropped when its lowercased form is itself in
    ``scrabble_list``, or when every whitespace-separated word in it
    (lowercased) is in ``scrabble_list``. Names with no words at all (empty
    or whitespace-only) are dropped too. Duplicates are collapsed, and the
    surviving names keep the order of their first appearance in ``in_list``.

    Progress is printed every 100 input elements, followed by a summary.

    Args:
        in_list: sized sequence of name strings to filter.
        scrabble_list: iterable of words to filter against. Names are
            lowercased before lookup but these entries are used as given,
            so they need to be lowercase to match.

    Returns:
        list of str: the unique names that were kept.
    """
    # Build the lookup set once; testing membership against a list would
    # rescan the whole word list for every name and every word.
    scrabble_words = set(scrabble_list)
    total = len(in_list)
    kept = []
    seen = set()
    for i, s in enumerate(in_list):
        if i % 100 == 0:
            print('percent done: {} % \t elements removed: {}'.format(int(1000*i/total)/10, i - len(kept)))
        if s in seen or s.lower() in scrabble_words:
            continue
        # Split into words first: iterating the string directly would test
        # single characters, which makes the check pass for almost any name.
        if any(word.lower() not in scrabble_words for word in s.split()):
            seen.add(s)
            kept.append(s)
    print('now have {} elements, removed {}'.format(len(kept), total - len(kept)))
    return kept
def parse_cmi():
    """Resolve the input and output file paths from the command line.

    Up to three positional arguments override the defaults, in this order:
    startups CSV to read, scrabble word list to read, filtered CSV to write.
    Arguments that are not supplied keep their default; arguments beyond the
    third are reported and ignored.

    Returns:
        tuple: ``(read_file, scrabble_file, write_file)``.
    """
    # defaults
    read_file = 'data/company/startups.csv'
    scrabble_file = 'data/company/english_scrabble.txt'
    write_file = 'data/company/startups_filtered.csv'
    out_files = [read_file, scrabble_file, write_file]

    # sys.argv[0] is the script's own path, so user arguments start at 1.
    user_args = sys.argv[1:]
    extra_args = user_args[len(out_files):]
    if extra_args:
        print('ignoring extra arguments: {}'.format(extra_args))
    for i, filename in enumerate(user_args[:len(out_files)]):
        out_files[i] = filename
    return tuple(out_files)
if __name__ == '__main__':
    # Resolve the three paths, load both inputs (startups first, then the
    # scrabble words), filter, and write the surviving names out.
    startups_path, words_path, output_path = parse_cmi()
    startups = open_data(startups_path)
    words = open_data(words_path)
    write_data(output_path, filter_list(startups, words))