-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathwinnowing.py
89 lines (71 loc) · 2.46 KB
/
winnowing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 1 23:02:27 2018
@author: Cherish
"""
import re, os
def preprocessing(filename):
with open(filename) as f:
content = f.read()
pattern = re.compile(r'[\s+] | [^a-zA-Z]') #去除所有空白字符或者标点符号
content = pattern.sub('', content)
content = content.lower()
return content
def gengerate_n_gram(string, n):
n_gram = []
for i in range(len(string) - n + 1):
n_gram.append(string[i:i + n])
return n_gram
def calculate_hashing_set(n_gram, Base, n):
hashinglist = []
hash = 0
first_gram = n_gram[0]
#单独计算第一个n_gram的哈希值
for i in range(n):
hash += ord(first_gram[i]) * (Base**(n - i - 1))
hashinglist.append(hash)
for i in range(1, len(n_gram)):
pre_gram = n_gram[i-1]
this_gram = n_gram[i]
hash = (hash - ord(pre_gram[0])*(Base**(n-1)))*Base + ord(this_gram[n - 1])
hashinglist.append(hash)
return hashinglist
#核心函数,计算一篇文章哈希值的数据摘要,算法为winnowing
def winnowing(hashinglist, t, n):
window = t - n + 1
min_val = 0
min_index = 0
fingerprint = {}
for i in range(len(hashinglist) - window + 1):
temp = hashinglist[i:i + window]
min_val = temp[0]
min_index = 0
for j in range(window):
if temp[j] <= min_val:
min_val = temp[j]
min_index = j
if (i + min_index) not in fingerprint.keys():
fingerprint[i + min_index] = min_val
return fingerprint
#比较两个文档的相似性
def comparison(fingerprint_1, fingerprint_2):
count = 0
size = min(len(fingerprint_1), len(fingerprint_2))
for i in fingerprint_1.values():
for j in fingerprint_2.values():
if i == j:
count += 1
break
return count / size
if __name__ == '__main__':
dirpath = os.getcwd()
n = 5
t = 9
Base = 17
print('n = 5, t = 9' )
path_1 = dirpath + '\\text1.txt'
path_2 = dirpath + '\\text2.txt'
fingerprint_1 = winnowing(calculate_hashing_set(gengerate_n_gram(preprocessing(path_1), n), Base, n), t, n)
fingerprint_2 = winnowing(calculate_hashing_set(gengerate_n_gram(preprocessing(path_2), n), Base, n), t, n)
similiarize = comparison(fingerprint_1, fingerprint_2)
print('两篇文章的相似度为{:5f}'.format(similiarize))