-
Notifications
You must be signed in to change notification settings - Fork 1
/
minhash.py
152 lines (121 loc) · 4.93 KB
/
minhash.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import sys
import random
import binascii
import time
HASH_NUM = 1000
# 32 byte hash
HASH_SIZE = 32
MAX_SHINGLE_ID = 2**HASH_SIZE-1
# PRIME is the next prime greater than MAX_SHINGLE_ID
def next_prime(hsize:int):
"""
Minhash relies on the next prime greater than the MAX_SHINGLE_ID (calculated as 2**HASH_SIZE-1).
At the moment, HASH_SIZE is set to 32 bytes -- in the future, a user may want to supply their
own hash_size and this function provides known prime numbers for common hash sizes (8,16,32).
"""
book = {
'8': 257,
'16': 65537,
'32': 4294967311,
}
if str(hsize) in book.keys():
return book[str(hsize)]
else:
raise NotImplementedError("only support for 8-,16-, and 32-byte hashes included")
PRIME = next_prime(HASH_SIZE)
SHINGLE_SIZE = 3
# SHINGLE_TYPE = 3
# SHINGLE_TYPE is 'word' or 'char'
SHINGLE_TYPE = 'word'
def HASH_FUNC(x):
return binascii.crc32(x.encode('utf-8')) & 0xffffffff
# TODO: should probably use the logger module instead of verbose
def show_hash(x:int, hash_size=HASH_SIZE, strict=False, verbose=True):
showable = ("{" + "0:0{}b".format(hash_size) + "}").format(x)
if hash_size < len(showable):
msg = "{} overflows a hash size of {}. Found size {}".format(x, hash_size, len(showable))
if strict:
raise RuntimeError(msg)
elif verbose:
print("[WARNING] {}".format(msg))
else:
pass
return showable
def calculate(
s1, s2, coeffs_a=None, coeffs_b=None, total_hash_num=HASH_NUM,
max_shingle_id=MAX_SHINGLE_ID, shingle_size=SHINGLE_SIZE,
shingle_type=SHINGLE_TYPE, hash_func=HASH_FUNC, prime=PRIME
):
if type(s1) == str:
# if string, turn to shingles
shingles1 = str_to_shingles(s1, shingle_size=shingle_size, shingle_type=shingle_type)
shingles2 = str_to_shingles(s2, shingle_size=shingle_size, shingle_type=shingle_type)
elif type(s1) == list or type(s1) == set:
# HACK: assuming shingles
shingles1 = s1
shingles2 = s2
if not coeffs_a:
coeffs_a = generate_coefficients(total_hash_num=total_hash_num, max_shingle_id=max_shingle_id)
coeffs_b = generate_coefficients(total_hash_num=total_hash_num, max_shingle_id=max_shingle_id)
sigs_a = get_min_signatures(shingles1, coeffs_a, coeffs_b, total_hash_num=total_hash_num, hash_func=hash_func)
sigs_b = get_min_signatures(shingles2, coeffs_a, coeffs_b, total_hash_num=total_hash_num, hash_func=hash_func)
union_count = 0
for i, val in enumerate(sigs_a):
if sigs_b[i] == val:
union_count += 1
return union_count / float(total_hash_num)
def get_min_signatures(shingles, coeffs_a, coeffs_b, total_hash_num=HASH_NUM, hash_func=HASH_FUNC, prime=PRIME):
min_signatures = list()
hash_count = 0
while hash_count < total_hash_num:
min_hash = prime + 1
for shingle in shingles:
# hash function is (a*x + b) % c
# Where 'x' is the input value, 'a' and 'b' are random coefficients, and 'c' is our prime num
current_hash = (coeffs_a[hash_count] * hash_func(shingle) + coeffs_b[hash_count]) % prime
if current_hash < min_hash:
min_hash = current_hash
min_signatures.append(min_hash)
hash_count += 1
return min_signatures
def str_to_shingles(string, shingle_size=SHINGLE_SIZE, shingle_type=SHINGLE_TYPE):
shingles_in_doc = set()
if shingle_type == 'word':
units = string.split(' ')
elif shingle_type == 'char':
units = list(string)
for idx in range(0, len(units) - (shingle_size - 1)):
if shingle_type == 'word':
shingle = ' '.join(units[idx:idx+shingle_size])
elif shingle_type == 'char':
shingle = ''.join(units[idx:idx+shingle_size])
shingles_in_doc.add(shingle)
return list(shingles_in_doc)
def generate_coefficients(total_hash_num=HASH_NUM, max_shingle_id=MAX_SHINGLE_ID):
# create a unique set of 'HASH_NUM' random values
rand_set = set()
hash_num = 0
while hash_num < total_hash_num:
rand_num = random.randint(0, max_shingle_id)
rand_set.add(rand_num)
# if rand_num not added, it means that it already exists in set
# Try again.
if len(rand_set) - 1 == hash_num:
hash_num += 1
return list(rand_set)
if __name__ == '__main__':
if len(sys.argv[1:]) == 0 or sys.argv[1] in ["--help", "-h"]:
print("minhash.py -- calculate the minhash of two files")
print("USAGE:")
print()
print(" python3 minhash.py file1 file2")
print()
print("file1 - file to compare")
print("file2 - file to compare")
else:
str_one, str_two = [b'',b'']
with open(sys.argv[1], 'rb+') as f:
str_one = f.read()
with open(sys.argv[2], 'rb+') as f:
str_two = f.read()
sys.stdout.write(calculate(str_one, str_two))