-
Notifications
You must be signed in to change notification settings - Fork 0
/
txtcmp.py
46 lines (39 loc) · 1.15 KB
/
txtcmp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import argparse
import glob
import math
import os
import sys

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def read_file(filename, method):
    """Return the distinct tokens of a text file joined by single spaces.

    Args:
        filename: Path of the file to read; it is decoded as UTF-8.
        method: ``'words'`` to treat every whitespace-separated word as a
            token, or ``'lines'`` to treat every line (with trailing
            whitespace stripped) as a token.

    Returns:
        A string holding each distinct token once, in order of first
        appearance, separated by single spaces.

    Raises:
        SystemExit: If ``method`` is neither ``'words'`` nor ``'lines'``.
    """
    if method not in ('words', 'lines'):
        sys.exit(f"unknown method {method!r}: expected 'words' or 'lines'")

    # A dict is used as an insertion-ordered set so that duplicates are
    # dropped while the first-seen order of tokens is kept.
    seen = {}
    # Explicit encoding keeps the result independent of the platform locale.
    with open(filename, encoding='utf-8') as fp:
        for line in fp:
            if method == 'words':
                seen.update(dict.fromkeys(line.split()))
            else:
                seen[line.rstrip()] = None
    return ' '.join(seen)
def compare_text(text1, text2):
    """Return the TF-IDF cosine distance between two texts.

    Both texts are vectorized together with a default ``TfidfVectorizer``
    and the distance is ``1 - cosine_similarity`` of the two resulting
    vectors.  A distance within 1e-6 of zero is returned as the integer 0.
    """
    tfidf = TfidfVectorizer().fit_transform([text1, text2])
    pairwise = cosine_similarity(tfidf)
    gap = 1 - pairwise[0][1]
    # Snap floating-point noise around zero to an exact 0.
    return 0 if math.isclose(gap, 0, abs_tol=1e-6) else gap
## Command Line Interface ##

def main():
    """Compare one reference file against every regular file in a directory.

    Reads the positional arguments ``file``, ``dir`` and ``data`` from the
    command line and prints one line per compared file: the reference path,
    the compared path and the distance returned by ``compare_text``.
    """
    # ArgumentParser's first positional parameter is ``prog``; pass the
    # summary as ``description`` so the usage line shows the script name.
    parser = argparse.ArgumentParser(description='text comparison program')
    parser.add_argument('file', help='reference file')
    parser.add_argument('dir', help='directory of files to compare against')
    # ``choices`` makes argparse reject an unsupported mode up front.
    parser.add_argument('data', choices=['words', 'lines'],
                        help='[words|lines]')
    arg = parser.parse_args()

    reference = read_file(arg.file, arg.data)
    # glob returns entries in arbitrary order; sort for reproducible output.
    for path in sorted(glob.glob(f'{arg.dir}/*')):
        # The pattern also matches subdirectories, which cannot be opened
        # as text files, so skip anything that is not a regular file.
        if not os.path.isfile(path):
            continue
        other = read_file(path, arg.data)
        print(arg.file, path, compare_text(reference, other))


if __name__ == '__main__':
    main()