-
Notifications
You must be signed in to change notification settings - Fork 3
/
intersect-vocabs.py
34 lines (29 loc) · 1.32 KB
/
intersect-vocabs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import re
import time
import io
import sys
import argparse
from collections import defaultdict
import gzip
# Vocabulary files are handled as raw bytes from end to end: tokens are split
# on ASCII whitespace and written back unchanged, so no text encoding has to
# be assumed and the code behaves the same on Python 2.7 and Python 3.


def _open_binary(path, mode):
    """Open *path* in the given binary *mode* ('rb' or 'wb').

    Paths ending in '.gz' are opened with gzip.open; everything else uses the
    builtin open. The returned object is usable as a context manager.
    """
    opener = gzip.open if path.endswith('.gz') else open
    return opener(path, mode)


def _read_vocab(path):
    """Return the set of whitespace-separated tokens (bytes) found in *path*."""
    with _open_binary(path, 'rb') as vocab_file:
        return set(vocab_file.read().split())


def _intersect_vocabs(vocabs):
    """Return a new set holding the words present in every set of *vocabs*.

    None of the input sets is modified. An empty *vocabs* yields an empty set.
    """
    vocabs = list(vocabs)
    if not vocabs:
        return set()
    # Copy the first set so the result never aliases (and mutates) an input.
    return set(vocabs[0]).intersection(*vocabs[1:])


def _write_vocab(words, path):
    """Write *words* (bytes tokens) to *path*, one per line, in sorted order.

    Sorting makes the output reproducible; set iteration order is not. As in
    the historical output format, no newline follows the last word.
    """
    with _open_binary(path, 'wb') as output_vocab_file:
        output_vocab_file.write(b'\n'.join(sorted(words)))


def main(argv=None):
    """Intersect the vocabularies named on the command line.

    argv: list of command-line arguments; None means sys.argv[1:] (argparse
        default).

    Exits with argparse's usage error (status 2) when a required option is
    missing or when --input_vocabs names no file at all.
    """
    # parse/validate arguments
    argparser = argparse.ArgumentParser()
    argparser.add_argument("-i", "--input_vocabs", required=True, help=
                           " Comma-separated list of text files.")
    argparser.add_argument("-o", "--output_vocab", required=True, help=
                           " Output vocab file, one word per line, which only includes words which appear in each of the input files.")
    args = argparser.parse_args(argv)

    # Drop empty entries so a stray or trailing comma is not treated as a
    # file called ''.
    input_vocab_filenames = [name for name in args.input_vocabs.split(',') if name]
    if not input_vocab_filenames:
        argparser.error('--input_vocabs must name at least one file')

    vocabs = []
    for input_vocab_filename in input_vocab_filenames:
        vocab = _read_vocab(input_vocab_filename)
        print('{} unique words found in {}'.format(len(vocab), input_vocab_filename))
        vocabs.append(vocab)

    # find intersection
    intersection = _intersect_vocabs(vocabs)

    # write to output file
    _write_vocab(intersection, args.output_vocab)
    print('{} unique words found in the intersection. written to {}'.format(len(intersection), args.output_vocab))


if __name__ == '__main__':
    main()